From 3c180e83c53f7be1b996ad97cc6102888a8aae91 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Tue, 2 Aug 2022 10:48:14 +0200 Subject: [PATCH 01/56] Start work on a CSV loader. --- opencog/persist/README.md | 7 ++++ opencog/persist/csv/load_csv.h | 62 ++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 opencog/persist/csv/load_csv.h diff --git a/opencog/persist/README.md b/opencog/persist/README.md index e40197a653..81de0b5ef6 100644 --- a/opencog/persist/README.md +++ b/opencog/persist/README.md @@ -17,6 +17,13 @@ Local subdirectories include: for RocksDB and one that allows AtomSpaces to trade Atoms over the network.) +* csv -- Load Values from CSV/TSV files. Each column in the CSV + table is loaded into an appropriate Value (`FloatValue`, + `BoolValue` or `StringValue`). The values are placed + under keys (named after the column) on the provided Atom. + This is intended for the ASMOSES subsystem, which + naturally operates on tables or streams of data. + * file -- Read and write files containing Atomese s-expressions. Provides both a `FileStorageNode`, and also some utilities to read files, and dump Atomspace contents to files or diff --git a/opencog/persist/csv/load_csv.h b/opencog/persist/csv/load_csv.h new file mode 100644 index 0000000000..6a2f2b45ae --- /dev/null +++ b/opencog/persist/csv/load_csv.h @@ -0,0 +1,62 @@ +/** load_csv.h --- + * + * Copyright (C) 2018 OpenCog Foundation + * Copyright (C) 2022 Linas Vepstas + * + * Author: Yidnekachew Wondimu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _ATOMESE_LOAD_CSV_H +#define _ATOMESE_LOAD_CSV_H + +namespace opencog { + +/** + * Load columns from a CSV file and place them into Atomese Values on + * the indicated Atom. Atomese Values are vectors (of floats, bools, + * srings, or more complex structures). Each Value holds one column + * from the dataset. + * + * The features (columns) specified in ignore_features will be omitted + * from the representation. + * + * For example, a CSV dataset like this: + * o, i1, i2, i3, i4 + * 1, 0, 0, 3.3, "foo" + * 0, 1, 0, 4.4, "bar" + * + * will be loaded as the following key-value pairs on the `anchor` Atom: + * (Predicate "*-column-names-*") (StringValue "o", "i1", "i2", "i3", "i4") + * (Predicate "o") (BoolValue 1 0) + * (Predicate "i1") (BoolValue 0 1) + * (Predicate "i2") (BoolValue 0 0) + * (Predicate "i3") (FloatValue 3.3 4.4) + * (Predicate "i4") (StringValue "foo" "bar") + * + * @param file_name + * @param ignore_features + * @return + */ +void load_csv_table( + const Handle& anchor, + const std::string& file_name, + const std::vector& ignore_features=std::vector()); + +} // end namespace opencog + +#endif //_ATOMESE_LOAD_CSV_H From f50a2038d152d5bf90a9c2c7b411bee223d44498 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Tue, 2 Aug 2022 10:54:14 +0200 Subject: [PATCH 02/56] initial scaffolding for csv tables --- opencog/persist/csv/load_csv.cc | 55 +++++++++++++++++++++++++++++++++ opencog/persist/csv/load_csv.h | 31 +++---------------- 2 files changed, 59 insertions(+), 27 deletions(-) create mode 100644 opencog/persist/csv/load_csv.cc diff --git a/opencog/persist/csv/load_csv.cc b/opencog/persist/csv/load_csv.cc new file mode 100644 index 0000000000..f69885c879 
--- /dev/null +++ b/opencog/persist/csv/load_csv.cc @@ -0,0 +1,55 @@ +/** + * load_csv.cc -- Load CSV tables into Values + * + * Copyright (C) 2022 Linas Vepstas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +using namespace opencog; + +/** + * Load columns from a CSV file and place them into Atomese Values on + * the indicated Atom. Atomese Values are vectors (of floats, bools, + * srings, or more complex structures). Each Value holds one column + * from the dataset. + * + * The features (columns) specified in ignore_features will be omitted + * from the representation. 
+ * + * For example, a CSV dataset like this: + * o, i1, i2, i3, i4 + * 1, 0, 0, 3.3, "foo" + * 0, 1, 0, 4.4, "bar" + * + * will be loaded as the following key-value pairs on the `anchor` Atom: + * (Predicate "*-column-names-*") (StringValue "o", "i1", "i2", "i3", "i4") + * (Predicate "o") (BoolValue 1 0) + * (Predicate "i1") (BoolValue 0 1) + * (Predicate "i2") (BoolValue 0 0) + * (Predicate "i3") (FloatValue 3.3 4.4) + * (Predicate "i4") (StringValue "foo" "bar") + * + * @param file_name + * @param ignore_features + * @return + */ +void load_csv_table( + const Handle& anchor, + const std::string& file_name, + const std::vector& ignore_features) +{ +} diff --git a/opencog/persist/csv/load_csv.h b/opencog/persist/csv/load_csv.h index 6a2f2b45ae..a28a0b56ae 100644 --- a/opencog/persist/csv/load_csv.h +++ b/opencog/persist/csv/load_csv.h @@ -1,4 +1,5 @@ -/** load_csv.h --- +/** + * load_csv.h -- Load CSV tables into Values * * Copyright (C) 2018 OpenCog Foundation * Copyright (C) 2022 Linas Vepstas @@ -26,32 +27,8 @@ namespace opencog { -/** - * Load columns from a CSV file and place them into Atomese Values on - * the indicated Atom. Atomese Values are vectors (of floats, bools, - * srings, or more complex structures). Each Value holds one column - * from the dataset. - * - * The features (columns) specified in ignore_features will be omitted - * from the representation. 
- * - * For example, a CSV dataset like this: - * o, i1, i2, i3, i4 - * 1, 0, 0, 3.3, "foo" - * 0, 1, 0, 4.4, "bar" - * - * will be loaded as the following key-value pairs on the `anchor` Atom: - * (Predicate "*-column-names-*") (StringValue "o", "i1", "i2", "i3", "i4") - * (Predicate "o") (BoolValue 1 0) - * (Predicate "i1") (BoolValue 0 1) - * (Predicate "i2") (BoolValue 0 0) - * (Predicate "i3") (FloatValue 3.3 4.4) - * (Predicate "i4") (StringValue "foo" "bar") - * - * @param file_name - * @param ignore_features - * @return - */ +// Load columns from a CSV file and place them into Atomese Values on +// the indicated Atom. See the .cc file for additional info. void load_csv_table( const Handle& anchor, const std::string& file_name, From b23a69ea6252bf5481eb82435be32b5bd0539bb9 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Tue, 2 Aug 2022 11:09:04 +0200 Subject: [PATCH 03/56] Copy code from asmoses --- opencog/persist/csv/table_io.cc | 1479 +++++++++++++++++++++++++++++++ opencog/persist/csv/table_io.h | 265 ++++++ 2 files changed, 1744 insertions(+) create mode 100644 opencog/persist/csv/table_io.cc create mode 100644 opencog/persist/csv/table_io.h diff --git a/opencog/persist/csv/table_io.cc b/opencog/persist/csv/table_io.cc new file mode 100644 index 0000000000..1f80c8fdc6 --- /dev/null +++ b/opencog/persist/csv/table_io.cc @@ -0,0 +1,1479 @@ +/** table_io.cc --- + * + * Copyright (C) 2010 OpenCog Foundation + * Copyright (C) 2012 Poulin Holdings LLC + * + * Authors: Nil Geisweiller + * Linas Vepstas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include "table.h" +#include "table_io.h" + +namespace opencog { namespace combo { + +using namespace std; +using namespace boost; +using namespace boost::phoenix; +using boost::phoenix::arg_names::arg1; + +// ------------------------------------------------------- + +bool checkCarriageReturn(istream& in) +{ + char next_c = in.get(); + if (next_c == '\r') // DOS format + next_c = in.get(); + if (next_c == '\n') + return true; + return false; +} + +void removeCarriageReturn(string& str) +{ + size_t s = str.size(); + if ((s > 0) && (str[s-1] == '\r')) + str.resize(s-1); +} + +//* Remove non-ascii characters at the bigining of the line, only. +void removeNonASCII(string& str) +{ + while (str.size() && (unsigned char)str[0] > 127) + str = str.substr(1); +} + +// ------------------------------------------------------- +// Return true if the character is one of the standard comment +// delimiters. Here, we define a 'standard delimiter' as one +// of hash, bang or semicolon. +bool is_comment(const char c) +{ + if ('#' == c) return true; + if (';' == c) return true; + if ('!' == c) return true; + if ('\n' == c) return true; + if ('\r' == c) return true; + if (0 == c) return true; + return false; +} + +/// Get one line of actual data. +/// This ignores lines that start with a 'standard comment char' +/// +// +// TODO: This routine should be extended so that comments that start +// somewhere other than column 0 are also ignored. 
+// +// The signature of this routine is the same as std:getline() +// +istream &get_data_line(istream& is, string& line) +{ + while (1) + { + getline(is, line); + if (!is) return is; + if (is_comment(line[0])) continue; + + // Remove weird symbols at the start of the line (only). + removeNonASCII(line); + // Remove carriage return at end of line (for DOS files). + removeCarriageReturn(line); + + return is; + } +} + +// ------------------------------------------------------- + +static const char *sparse_delim = " : "; + +/** + * parse a pair of key/value in a parse dataset, using ':' as + * delimiter. For instance + * + * parse_key_val("key : val") + * + * returns + * + * {"key", "val"} + * + * If no such delimiter is found then it return a pair with empty key + * and empty val. + */ +static pair +parse_key_val(string chunk) +{ + pair res; + size_t pos = chunk.find(sparse_delim); + if (string::npos == pos) + return res; + string key = chunk.substr(0, pos); + boost::trim(key); + string val = chunk.substr(pos + strlen(sparse_delim)); + boost::trim(val); + return {key, val}; +} + +/** + * Take a row, return a tokenizer. Tokenization uses the + * separator characters comma, blank, tab (',', ' ' or '\t'). + */ +table_tokenizer get_row_tokenizer(const std::string& line) +{ + typedef boost::escaped_list_separator separator; + typedef boost::tokenizer tokenizer; + + // Tokenize line; currently, we allow tabs, commas, blanks. + static const separator sep("\\", ",\t ", "\""); + return tokenizer(line, sep); +} + +// Same as above, but only allow commas as a column separator. +table_tokenizer get_sparse_row_tokenizer(const string& line) +{ + typedef boost::escaped_list_separator separator; + typedef boost::tokenizer tokenizer; + + // Tokenize line; currently, we allow tabs, commas, blanks. + static const separator sep("\\", ",", "\""); + return tokenizer(line, sep); +} + +/** + * Take a line and return a vector containing the elements parsed. + * Used by istreamTable. 
This will modify the line to remove leading + * non-ASCII characters, as well as stripping of any carriage-returns. + */ +vector tokenizeSparseRow(const string& line) +{ + table_tokenizer tok = get_sparse_row_tokenizer(line); + vector res; + for (string t : tok) { + boost::trim(t); + res.push_back(t); + } + return res; +} + +// ------------------------------------------------------- +/** + * Given an input string, guess the type of the string. + * Inferable types are: boolean, contin and enum. + */ +type_node infer_type_from_token(const string& token) +{ + /* Prefered representation is T's and 0's, to maximize clarity, + * readability. Numeric values are easily confused with contin + * type. + */ + if (token == "0" || + token == "1" || + token == "T" || + token == "F" || + token == "t" || + token == "f") + return id::boolean_type; + + // If it starts with an alphabetic character, assume its a string + else if (isalpha(token[0])) + return id::enum_type; + + // Hope that we can cast this to a float point number. + else { + try { + lexical_cast(token); + return id::contin_type; + } + catch(...) { + return id::ill_formed_type; + } + } +} + +/** + * Given an input string, guess the type of the string. + * Inferable types are: boolean, contin and enum. + * Compare this to 'curr_guess', and upgrade the type inference + * if it can be done consistently. + */ +static type_node +infer_type_from_token2(type_node curr_guess, const string& token) +{ + type_node tokt = infer_type_from_token(token); + + // First time, just go with the flow. + if (id::unknown_type == curr_guess) + return tokt; + + // Yayy! its consistent! + if (tokt == curr_guess) + return tokt; + + // If we saw 0,1 when expecting a contin, its a contin. + if ((id::contin_type == curr_guess) && (id::boolean_type == tokt)) + return curr_guess; + + // If we thought its a boolean 0,1 it might be a contin. 
+ if ((id::boolean_type == curr_guess) && (id::contin_type == tokt)) + return tokt; + + // If we got to here, then there's some sort of unexpected + // inconsistency in the column types; we've got to presume that + // its just some crazy ascii string, i.e. enum_type. + return id::enum_type; +} + +/// cast string "token" to a vertex of type "tipe" +builtin token_to_boolean(const string& token) +{ + if ("0" == token || "F" == token || "f" == token) + return id::logical_false; + else if ("1" == token || "T" == token || "t" == token) + return id::logical_true; + else { + OC_ASSERT(false, "Expecting boolean value, got %s", token.c_str()); + return builtin(); + } +} +contin_t token_to_contin(const string& token) +{ + try { + return lexical_cast(token); + } catch(boost::bad_lexical_cast&) { + OC_ASSERT(false, "Could not cast %s to contin", token.c_str()); + return contin_t(); + } +} +vertex token_to_vertex(const type_node &tipe, const string& token) +{ + switch (tipe) { + + case id::boolean_type: + return token_to_boolean(token); + + case id::contin_type: + return token_to_contin(token); + + case id::enum_type: + // Enum types must begin with an alpha character + if (isalpha(token[0])) + return enum_t(token); + OC_ASSERT(false, "Enum type must begin with alphabetic char, but %s doesn't", token.c_str()); + break; + + case id::definite_object_type: + return token; + break; + + // Ugly hack ... the problem adressed here is that feature + // selection has to read and propagate columns of unknown type + // (typically, dates, times). So we hack around this here. + case id::ill_formed_type: + return enum_t(token); + // return id::ill_formed_type; + // return id::null_vertex; + break; + + default: + stringstream ss; + ss << "Unable to convert token \"" << token << "\" to type=" << tipe << endl; + OC_ASSERT(0, ss.str().c_str()); + } + + // unreachable + return id::null_vertex; +} + +// =========================================================== +// istream regular tables. 
+ +/** + * Fill the input table, given a file in DSV (delimiter-seperated values) + * format. The delimiters are ',', ' ' or '\t'. + * + * It stuffs all data into the table as strings; type conversion to + * the appropriate type, and thunking for the header, and ignoring + * certain features, must all be done as a separate step. + */ +istream& istreamRawITable(istream& in, ITable& tab, + const vector& ignored_indices) +{ + streampos beg = in.tellg(); + + // Get the entire dataset into memory + string line; + std::vector lines; + + // Read first few by hand. The first might be labels, so we must + // get at least the second line. But the second line might have + // all default feature values (i.e. no colon), so get the third... + dorepeat(20) { + if (!get_data_line(in, line)) + break; + // If it is a sparse file, we are outta here. + // Throw an std::exception, since we don't want to log this as an + // error (all the other exception types log to the log file). + if (string::npos != line.find (sparse_delim)) { + in.seekg(beg); + throw std::exception(); + } + lines.push_back(line); + } + + // Grab the rest of the file. + while (get_data_line(in, line)) + lines.push_back(line); + + // Determine the arity from the first line. 
+ vector fl = tokenizeRow(lines[0], ignored_indices); + arity_t arity = fl.size(); + + std::atomic arity_fail_row(-1); + auto parse_line = [&](size_t i) + { + // tokenize the line and fill the table with + tab[i] = tokenizeRow(lines[i], ignored_indices); + + // Check arity + if (arity != (arity_t)tab[i].size()) + arity_fail_row = i + 1; + }; + + // Vector of indices [0, lines.size()) + size_t ls = lines.size(); + tab.resize(ls); + auto ir = boost::irange((size_t)0, ls); + vector indices(ir.begin(), ir.end()); + OMP_ALGO::for_each(indices.begin(), indices.end(), parse_line); + + if (-1 != arity_fail_row) { + in.seekg(beg); + OC_ASSERT(false, + "ERROR: Input file inconsistent: the %uth row has " + "a different number of columns than the rest of the file. " + "All rows should have the same number of columns.\n", + arity_fail_row.load()); + } + return in; +} + +vector get_header(const string& file_name) +{ + ifstream in(file_name.c_str()); + string line; + get_data_line(in, line); + return tokenizeRow(line); +} + +// =========================================================== +/** + * Visitor to parse a list of strings (buried in a multi_type_seq) + * into a multi_type_seq containing the typed values given the input + * type signature. 
+ */ +struct from_tokens_visitor : public boost::static_visitor +{ + from_tokens_visitor(const type_node_seq& types) : _types(types) { + all_boolean = boost::count(types, id::boolean_type) == (int)types.size(); + all_contin = boost::count(types, id::contin_type) == (int)types.size(); + } + result_type operator()(const string_seq& seq) { + result_type res; + if (all_boolean) { + res = builtin_seq(); + builtin_seq& bs = res.get_seq(); + boost::transform(seq, back_inserter(bs), token_to_boolean); + } + else if (all_contin) { + res = contin_seq(); + contin_seq& cs = res.get_seq(); + boost::transform(seq, back_inserter(cs), token_to_contin); + } + else { + res = vertex_seq(); + vertex_seq& vs = res.get_seq(); + boost::transform(_types, seq, back_inserter(vs), token_to_vertex); + } + return res; + } + template result_type operator()(const Seq& seq) { + OC_ASSERT(false, "You are not supposed to do that"); + return result_type(); + } + const type_node_seq& _types; + bool all_boolean, all_contin; +}; + + +/** + * The class below tokenizes one row, and jams it into the table + */ +struct from_sparse_tokens_visitor : public from_tokens_visitor +{ + from_sparse_tokens_visitor(const type_node_seq& types, + const std::map& index, + size_t fixed_arity) + : from_tokens_visitor(types), _index(index), _fixed_arity(fixed_arity) {} + result_type operator()(const string_seq& seq) { + using std::transform; + using std::for_each; + result_type res; + if (all_boolean) { + res = builtin_seq(_types.size(), id::logical_false); + builtin_seq& bs = res.get_seq(); + auto begin_sparse = seq.begin() + _fixed_arity; + transform(seq.begin(), begin_sparse, bs.begin(), token_to_boolean); + for (auto it = begin_sparse; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + bs[idx] = token_to_boolean(key_val.second); + } + } + } + else if (all_contin) { + res = contin_seq(_types.size(), 0.0); + contin_seq& cs = 
res.get_seq(); + auto begin_sparse = seq.cbegin() + _fixed_arity; + transform(seq.begin(), begin_sparse, cs.begin(), token_to_contin); + for (auto it = begin_sparse; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + cs[idx] = token_to_contin(key_val.second); + } + } + } + else { + res = vertex_seq(_types.size()); + vertex_seq& vs = res.get_seq(); + auto begin_sparse_types = _types.cbegin() + _fixed_arity; + auto begin_sparse_seq = seq.cbegin() + _fixed_arity; + transform(_types.begin(), begin_sparse_types, + seq.begin(), vs.begin(), token_to_vertex); + for (auto it = begin_sparse_seq; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + vs[idx] = token_to_vertex(_types[idx], key_val.second); + } + } + } + return res; + } + std::map _index; + size_t _fixed_arity; +}; + + +// =========================================================== +/** + * Fill the input table, given a file in 'sparse' format. + * + * The sparse table format consists of some fixed number of columns, + * in comma-separated format, followed by key-value pairs, also + * tab-separated. viz: + * + * val, val, val, name:val, name:val, name:val + * + * Thus, for example, a row such as + * + * earn, issued : 1, results : 2, ending : 1, including : 1 + * + * indicates that there one fixed column, of enum type, (the enum value + * being "earn"), and that features called "issued", "ending" and + * "including" have a contin value of 1.0 and "results" has a contin + * value of 2. + * + * The routine does NOT store the table in sparse format: it stores the + * full, exploded table. This could be bad ... + * TODO: we really need a sparse table format, as well. + * + * The "Raw" format has all data as strings; type conversion to the + * appropriate type, must all be done as a separate step. 
+ */ +istream& istreamSparseITable(istream& in, ITable& tab) +{ + // The raw dataset + std::vector lines; + + // The first non-comment line is assumed to be the header. + // ... unless it isn't. (The header must not contain a colon). + vector labs; + size_t fixed_arity = 0; + string header; + get_data_line(in, header); + if (string::npos == header.find(sparse_delim)) { + // Determine the arity of the fixed columns + vector hdr = tokenizeSparseRow(header); + fixed_arity = hdr.size(); + labs = hdr; + } + else { + lines.push_back(header); + } + + // Get the entire dataset into memory + string iline; + while (get_data_line(in, iline)) + lines.push_back(iline); + + if (0 == fixed_arity) { + vector fixy = tokenizeSparseRow(lines[0]); + // count commas, until a semi-colon is found. + while (string::npos == fixy[fixed_arity].find(sparse_delim)) + fixed_arity++; + } + logger().info() << "Sparse file fixed column count=" << fixed_arity; + + // Get a list of all of the features. + set feats; + // All sparse features have the same type. + type_node feat_type = id::unknown_type; + + // Fixed features may have different types, by column. + type_node_seq types(fixed_arity, id::unknown_type); + + for (const string& line : lines) { + vector chunks = tokenizeSparseRow(line); + vector::const_iterator pit = chunks.begin(); + + // Infer the types of the fixed features. + size_t off = 0; + for (; off < fixed_arity; ++off, ++pit) + types[off] = infer_type_from_token2(types[off], *pit); + + for (; pit != chunks.end(); ++pit) { + // Rip out the key-value pairs + auto key_val = parse_key_val(*pit); + if (key_val == pair()) + break; + // Store the key, uniquely. Store best guess as the type. 
+ feats.insert(key_val.first); + feat_type = infer_type_from_token2(feat_type, key_val.second); + } + } + logger().info() << "Sparse file unique features count=" << feats.size(); + logger().info() << "Sparse file feature type=" << feat_type; + logger().info() << "Sparse file row count=" << lines.size(); + + // Convert the feature set into a list of labels. + // 'index' is a map from feature name to column number. + size_t cnt = fixed_arity; + map index; + for (const string& key : feats) { + types.push_back(feat_type); + labs.push_back(key); + index[key] = cnt; + cnt++; + } + tab.set_labels(labs); + tab.set_types(types); + + // And finally, stuff up the table. + from_sparse_tokens_visitor fstv(types, index, fixed_arity); + auto fill_line = [&](int i) + { + const string& line = lines[i]; + // Tokenize the line + vector chunks = tokenizeSparseRow(line); + multi_type_seq row = fstv(chunks); + tab[i] = row; + }; + + // Vector of indices [0, lines.size()) + size_t ls = lines.size(); + tab.resize(ls); + auto ir = boost::irange((size_t)0, ls); + vector indices(ir.begin(), ir.end()); + OMP_ALGO::for_each(indices.begin(), indices.end(), fill_line); + + return in; +} + +/** + * Infer the column types of the input table. It is assumed the + * table's rows are vector of strings. + */ +type_node_seq infer_column_types(const ITable& tab) +{ + vector::const_iterator rowit = tab.begin(); + + arity_t arity = rowit->size(); + type_node_seq types(arity, id::unknown_type); + + // Skip the first line, it might be a header... + // and that would confuse type inference. 
+ if (tab.size() > 1) + ++rowit; + for (; rowit != tab.end(); ++rowit) + { + const string_seq& tokens = rowit->get_seq(); + for (arity_t i=0; iget_seq(); + + arity_t arity = row.size(); + + for (arity_t i=0; i& tokens, const type_node_seq& col_types) +{ + for (size_t i = 0; i < tokens.size(); i++) { + type_node flt = infer_type_from_token2(col_types[i], tokens[i]); + if ((id::enum_type == flt) && (id::enum_type != col_types[i])) + return true; + } + return false; +} + +/** + * Fill the input table only, given a DSV (delimiter-seperated values) + * file format, where delimiters are ',', ' ' or '\t'. + * + * This algorithm makes several passes over the data. First, it reads + * the entire table, as a collection of strings. Next, it tries to + * infer the column types, and the presence of a header. + */ +istream& istreamITable(istream& in, ITable& tab, + const vector& ignore_features) +{ + try { + istreamRawITable(in, tab); + } + catch (std::exception& e) { + istreamSparseITable(in, tab); + // Get rid of the unwanted columns. + tab.delete_columns(ignore_features); + return in; + } + + // Determine the column types. + type_node_seq col_types = infer_column_types(tab); + tab.set_types(col_types); + + // If there is a header row, then it must be the column labels. + if (has_header(tab, col_types)) { + tab.set_labels(tab.begin()->get_seq()); + tab.erase(tab.begin()); + } + + // Now that we have some column labels to work off of, + // Get rid of the unwanted columns. + tab.delete_columns(ignore_features); + + // Finally, perform a column type conversion + from_tokens_visitor ftv(tab.get_types()); + auto aft = apply_visitor(ftv); + OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), + [&](multi_type_seq& seq) { + return aft(seq.get_variant()); + }); + + return in; +} + +/** + * Like istreamITable but add the option to ignore indices. 
+ * + * It's akind of a temporary hack, till it's clear that this is much + * faster and we should recode istreamITable to ignore features + * head-on. + * + * Also, it assumes that the dataset is not sparse. + */ +istream& istreamITable_ignore_indices(istream& in, ITable& tab, + const vector& ignore_indices) +{ + istreamRawITable(in, tab, ignore_indices); + + // Determine the column types. + type_node_seq col_types = infer_column_types(tab); + tab.set_types(col_types); + + // If there is a header row, then it must be the column labels. + if (has_header(tab, col_types)) { + tab.set_labels(tab.begin()->get_seq()); + tab.erase(tab.begin()); + } + + // Finally, perform a column type conversion + from_tokens_visitor ftv(tab.get_types()); + auto aft = apply_visitor(ftv); + OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), + [&](multi_type_seq& seq) { + return aft(seq.get_variant()); + }); + + return in; +} + +OTable loadOTable(const string& file_name, const string& target_feature) +{ + vector ignore_features; + for (const string& l : get_header(file_name)) + if (l != target_feature) + ignore_features.push_back(l); + + ITable itab = loadITable(file_name, ignore_features); + OTable res(itab.get_column_data(target_feature), target_feature); + return res; +} + +/** + * Take a line and return a triple with vector containing the input + * elements, output element and timestamp. 
+ */ +std::tuple, string, string> +tokenizeRowIOT(const std::string& line, + const std::vector& ignored_indices, + int target_idx, // < 0 == ignored + int timestamp_idx) // < 0 == ignored +{ + std::tuple, string, string> res; + table_tokenizer toker = get_row_tokenizer(line); + int i = 0; + for (const std::string& tok : toker) { + if (!boost::binary_search(ignored_indices, i)) { + string el = boost::lexical_cast(tok); + if (target_idx == i) + std::get<1>(res) = el; + else if (timestamp_idx == i) + std::get<2>(res) = el; + else + std::get<0>(res).push_back(el); + } + i++; + } + return res; +} + +ITable loadITable(const string& file_name, + const vector& ignore_features) +{ + OC_ASSERT(!file_name.empty(), "the file name is empty"); + ifstream in(file_name.c_str()); + OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); + + ITable res; + istreamITable(in, res, ignore_features); + return res; +} + +/** + * Like loadITable but it is optimized by ignoring features head-on + * (rather than loading them, then removing them. + * + * WARNING: it assumes the dataset has a header!!! + */ +ITable loadITable_optimized(const string& file_name, + const vector& ignore_features) +{ + OC_ASSERT(!file_name.empty(), "the file name is empty"); + ifstream in(file_name.c_str()); + OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); + + // determined ignore_indices + vector ignore_indices = get_indices(ignore_features, + get_header(file_name)); + + ITable res; + istreamITable_ignore_indices(in, res, ignore_indices); + return res; +} + +/** + * Fill an input table and output table given a DSV + * (delimiter-seperated values) file format, where delimiters are ',', + * ' ' or '\t'. + * + * It is assumed that each row have the same number of columns, if not + * an assert is raised. + * + * pos specifies the position of the output, if -1 it is the last + * position. The default position is 0, the first column. 
+ * + * This is only used for sparse table and could be optimized + */ +istream& istreamTable_OLD(istream& in, Table& tab, + const string& target_feature, + const vector& ignore_features) +{ + istreamITable(in, tab.itable, ignore_features); + + tab.otable = tab.itable.get_column_data(target_feature); + OC_ASSERT(0 != tab.otable.size(), + "Fatal Error: target feature \"%s\" not found", + target_feature.c_str()); + + tab.target_pos = tab.itable.get_column_offset(target_feature); + + type_node targ_type = tab.itable.get_type(target_feature); + + string targ_feat = tab.itable.delete_column(target_feature); + + tab.otable.set_label(targ_feat); + tab.otable.set_type(targ_type); + + return in; +} + +/** + * Like istreamTable but optimize by ignoring features head-on rather + * than loading them then removing them. + * + * Warning: only works on dense data with header file. + */ +istream& istreamTable_ignore_indices(istream& in, Table& tab, + const string& target_feature, + const vector& ignore_indices) +{ + istreamITable_ignore_indices(in, tab.itable, ignore_indices); + + tab.otable = tab.itable.get_column_data(target_feature); + OC_ASSERT(0 != tab.otable.size(), + "Fatal Error: target feature \"%s\" not found", + target_feature.c_str()); + + tab.target_pos = tab.itable.get_column_offset(target_feature); + + type_node targ_type = tab.itable.get_type(target_feature); + + string targ_feat = tab.itable.delete_column(target_feature); + + tab.otable.set_label(targ_feat); + tab.otable.set_type(targ_type); + + return in; +} + +// ================================================================== + +static istream& +inferTableAttributes(istream& in, const string& target_feature, + const string& timestamp_feature, + const vector& ignore_features, + type_tree& tt, bool& has_header, bool& is_sparse) +{ + // maxline is the maximum number of lines to read to infer the + // attributes. A negative number means reading all lines. 
+ int maxline = 20; + streampos beg = in.tellg(); + + // Get a portion of the dataset into memory (cleaning weird stuff) + std::vector lines; + { + string line; + is_sparse = false; + while (get_data_line(in, line) && maxline-- > 0) { + // It is sparse + is_sparse = is_sparse || string::npos != line.find(sparse_delim); + if (is_sparse) { // just get out + // TODO could be simplified, optimized, etc + in.seekg(beg); + in.clear(); // in case it has reached the eof + return in; + } + + // put the line in a buffer + lines.push_back(line); + } + } + + // parse what could be a header + const vector maybe_header = tokenizeRow(lines.front()); + + // determine arity + arity_t arity = maybe_header.size(); + std::atomic arity_fail_row(-1); + + // determine initial type + type_node_seq types(arity, id::unknown_type); + + // parse the rest, determine its type and whether the arity is + // consistent + for (size_t i = 1; i < lines.size(); ++i) { + // Parse line + const string_seq& tokens = tokenizeRow(lines[i]); + + // Check arity + if (arity != (arity_t)tokens.size()) { + arity_fail_row = i + 1; + in.seekg(beg); + in.clear(); // in case it has reached the eof + OC_ASSERT(false, + "ERROR: Input file inconsistent: the %uth row has a " + "different number of columns than the rest of the file. 
" + "All rows should have the same number of columns.\n", + arity_fail_row.load()); + } + + // Infer type + boost::transform(types, tokens, types.begin(), + infer_type_from_token2); + } + + // Determine has_header + has_header = is_header(maybe_header, types); + + // Determine type signature + if (has_header) { + + // if unspecified, the target is the first column + unsigned target_idx = 0; + + // target feature will be ignored + if (!target_feature.empty()) { + auto target_it = std::find(maybe_header.begin(), maybe_header.end(), + target_feature); + OC_ASSERT(target_it != maybe_header.end(), "Target %s not found", + target_feature.c_str()); + target_idx = std::distance(maybe_header.begin(), target_it); + } + vector ignore_idxs = + get_indices(ignore_features, maybe_header); + ignore_idxs.push_back(target_idx); + boost::sort(ignore_idxs); + + // Include timestamp feature as idx to ignore + if (!timestamp_feature.empty()) { + auto timestamp_it = std::find(maybe_header.begin(), maybe_header.end(), + timestamp_feature); + OC_ASSERT(timestamp_it != maybe_header.end(), + "Timestamp feature %s not found", + timestamp_feature.c_str()); + unsigned timestamp_idx = std::distance(maybe_header.begin(), timestamp_it); + ignore_idxs.push_back(timestamp_idx); + boost::sort(ignore_idxs); + } + + // Generate type signature + type_node otype = types[target_idx]; + type_node_seq itypes; + for (unsigned i = 0; i < types.size(); ++i) + if (!boost::binary_search(ignore_idxs, i)) + itypes.push_back(types[i]); + tt = gen_signature(itypes, otype); + } else { + // No header, the target is the first column + type_node otype = types[0]; + types.erase(types.begin()); + tt = gen_signature(types, otype); + } + logger().debug() << "Infered type tree: " << tt; + + in.seekg(beg); + in.clear(); // in case it has reached the eof + return in; +} + +/** + * Perform 2 passes: + * + * 1) Infer + * 1.1) its type + * 1.2) whether it has a header + * 1.3) whether it is dense or sparse + * + * 2) Load the 
actual data. + */ +istream& istreamTable(istream& in, Table& tab, + const string& target_feature, + const string& timestamp_feature, + const vector& ignore_features) +{ + // Infer the properties of the table without loading its content + type_tree tt; + bool has_header, is_sparse; + streampos beg = in.tellg(); + inferTableAttributes(in, target_feature, timestamp_feature, + ignore_features, tt, has_header, is_sparse); + in.seekg(beg); + + if (is_sparse) { + // fallback on the old loader + // TODO: this could definitely be optimized + OC_ASSERT(timestamp_feature.empty(), "Timestamp feature not implemented"); + return istreamTable_OLD(in, tab, target_feature, ignore_features); + } else { + return istreamDenseTable(in, tab, target_feature, timestamp_feature, + ignore_features, tt, has_header); + } +} + +// ================================================================== + +/** + * Take a line and return a pair with vector containing the input + * elements and then output element. + */ +template +std::pair, T> +tokenizeRowIO( + const std::string& line, + const std::vector& ignored_indices=std::vector(), + unsigned target_idx=0) +{ + std::pair, T> res; + table_tokenizer toker = get_row_tokenizer(line); + size_t i = 0; + for (const std::string& tok : toker) { + if (!boost::binary_search(ignored_indices, i)) { + T el = boost::lexical_cast(tok); + if (target_idx == i) + res.second = el; + else + res.first.push_back(el); + } + i++; + } + return res; +} + +// ================================================================== + +static istream& +istreamDenseTable_noHeader(istream& in, Table& tab, + int target_idx, // < 0 == ignore + int timestamp_idx, // < 0 == ignore + const vector& ignore_idxs, + const type_tree& tt, bool has_header) +{ + // Get the entire dataset into memory (cleaning weird stuff) + string line; + std::vector lines; + while (get_data_line(in, line)) + lines.push_back(line); + + // Allocate all rows in the itable, otable and ttable + 
tab.itable.resize(lines.size()); + tab.otable.resize(lines.size()); + if (timestamp_idx >= 0) + tab.ttable.resize(lines.size()); + + // Get the elementary io types + type_node_seq itypes = + vector_comp(get_signature_inputs(tt), get_type_node); + type_node otype = get_type_node(get_signature_output(tt)); + + // Assign the io type to the table + tab.itable.set_types(itypes); + tab.otable.set_type(otype); + + // Instantiate type conversion for inputs + from_tokens_visitor ftv(itypes); + + // Function to parse each line (to be called in parallel) + auto parse_line = [&](unsigned i) { + try { + // Fill input + auto tokenIOT = tokenizeRowIOT(lines[i], ignore_idxs, + target_idx, timestamp_idx); + tab.itable[i] = ftv(std::get<0>(tokenIOT)); + + // Fill output + string output_str = std::get<1>(tokenIOT); + // If there is no valid target index, then there is no + // "output" column! + if ("" != output_str) + tab.otable[i] = token_to_vertex(otype, output_str); + + // Fill date + string date_str = std::get<2>(tokenIOT); + // If there is no valid timestamp index, then there is no + // "output" column! + if ("" != date_str) + tab.ttable[i] = TTable::from_string(date_str); + } + catch (AssertionException& ex) { + unsigned lineno = has_header? 
i+1 : i; + OC_ASSERT(false, "Parsing error occurred on line %d of input file\n" + "Exception: %s", lineno, ex.what()); + } + }; + + // Call it for each line in parallel + auto ir = boost::irange((size_t)0, lines.size()); + vector row_idxs(ir.begin(), ir.end()); + OMP_ALGO::for_each(row_idxs.begin(), row_idxs.end(), parse_line); + + // Assign the target position relative to the ignored indices + // (useful for writing that file back) + tab.target_pos = target_idx - boost::count_if(ignore_idxs, + arg1 < target_idx); + + if (timestamp_idx >= 0) + tab.timestamp_pos = timestamp_idx - + boost::count_if(ignore_idxs, arg1 < timestamp_idx); + + return in; +} + +istream& istreamDenseTable(istream& in, Table& tab, + const string& target_feature, + const string& timestamp_feature, + const vector& ignore_features, + const type_tree& tt, bool has_header) +{ + OC_ASSERT(has_header + || (target_feature.empty() + && ignore_features.empty() + && timestamp_feature.empty()), + "If the data file has no header, " + "then a target feature, ignore features or " + "timestamp_feature cannot be specified"); + + // determine target, timestamp and ignore indexes + int target_idx = 0; // if no header, target is at the first + // column by default + + int timestamp_idx = -1; // disabled by default + vector ignore_idxs; + if (has_header) { + string line; + get_data_line(in, line); + vector header = tokenizeRow(line); + + // Set target idx + if (!target_feature.empty()) { + auto target_it = std::find(header.begin(), header.end(), + target_feature); + OC_ASSERT(target_it != header.end(), "Target %s not found", + target_feature.c_str()); + target_idx = std::distance(header.begin(), target_it); + } + + // Set timestamp idx + if (!timestamp_feature.empty()) { + auto timestamp_it = std::find(header.begin(), header.end(), + timestamp_feature); + OC_ASSERT(timestamp_it != header.end(), "Timestamp feature %s not found", + timestamp_feature.c_str()); + timestamp_idx = std::distance(header.begin(), 
timestamp_it); + } + + // Set ignore idxs + ignore_idxs = get_indices(ignore_features, header); + + // get input and output labels from the header + auto iotlabels = tokenizeRowIOT(line, ignore_idxs, + target_idx, timestamp_idx); + tab.itable.set_labels(std::get<0>(iotlabels)); + tab.otable.set_label(std::get<1>(iotlabels)); + tab.ttable.set_label(std::get<2>(iotlabels)); + } + + return istreamDenseTable_noHeader(in, tab, target_idx, timestamp_idx, + ignore_idxs, tt, has_header); +} + +// ================================================================== + +// Parse a CompressedTable row +// TODO: implement timestamp support +CompressedTable::value_type parseCompressedTableRow(const type_tree& tt, const std::string& row_str) +{ + // split the string between input and output + unsigned end_outputs_pos = row_str.find("}"); + string outputs = row_str.substr(1, end_outputs_pos - 1), + inputs = row_str.substr(end_outputs_pos + 2); // +2 to go + // passed the + // following , + + // convert the inputs string into multi_type_seq + type_node_seq tns = vector_comp(get_signature_inputs(tt), get_type_node); + vector input_seq = tokenizeRow(inputs); + from_tokens_visitor ftv(tns); + multi_type_seq input_values = ftv(input_seq); + + // convert the outputs string into CompressedTable::counter_t + vector output_pair_seq = tokenizeRow(outputs); + CompressedTable::counter_t counter; + for (const string& pair_str : output_pair_seq) { + unsigned sep_pos = pair_str.find(":"); + string key_str = pair_str.substr(0, sep_pos), + value_str = pair_str.substr(sep_pos + 1); + vertex v = token_to_vertex(get_type_node(get_signature_output(tt)), + key_str); + count_t count = atof(value_str.c_str()); + counter[TimedValue(v)] = count; + } + return CompressedTable::value_type(input_values, counter); +} + +// WARNING: this implementation only supports boolean ctable!!!! 
+std::istream& istreamCompressedTable(std::istream& in, CompressedTable& ctable) +{ + //////////////// + // set header // + //////////////// + string header_line; + get_data_line(in, header_line); + auto labels = tokenizeRow(header_line); + ctable.set_labels(labels); + + //////////////////////// + // set type signature // + //////////////////////// + // HACK THIS PART TO MAKE IT SUPPORT OTHER TYPES THAN BOOLEAN + ctable.set_signature(gen_signature(id::boolean_type, ctable.get_arity())); + + ///////////////// + // set content // + ///////////////// + std::vector lines; + // read the entire file + { + string line; + while (get_data_line(in, line)) + lines.push_back(line); + } + // parse each line and fill the ctable + for (const string& line : lines) + ctable.insert(parseCompressedTableRow(ctable.get_signature(), line)); + + return in; +} + +Table loadTable(const std::string& file_name, + const std::string& target_feature, + const std::string& timestamp_feature, + const string_seq& ignore_features) +{ + OC_ASSERT(!file_name.empty(), "the file name is empty"); + ifstream in(file_name.c_str()); + OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); + + Table res; + istreamTable(in, res, target_feature, timestamp_feature, ignore_features); + return res; +} + +CompressedTable loadCompressedTable(const string& file_name) +{ + CompressedTable ctable; + OC_ASSERT(!file_name.empty(), "No filename specified!"); + ifstream in(file_name.c_str()); + istreamCompressedTable(in, ctable); + return ctable; +} + +// =========================================================== +// ostream regular tables + +void saveTable(const string& file_name, const Table& table) +{ + OC_ASSERT(!file_name.empty(), "No filename specified!"); + ofstream out(file_name.c_str()); + OC_ASSERT(out.is_open(), "Could not open %s", file_name.c_str()); + ostreamTable(out, table); +} + +// =========================================================== +// ostream CompressedTables + +ostream& 
ostreamCompressedTableHeader(ostream& out, const CompressedTable& ct) +{ + return ostreamln_container(out, ct.get_labels(), ","); +} + +ostream& ostreamCompressedTableRow(ostream& out, const CompressedTable::value_type& ctv) +{ + to_strings_visitor tsv; + auto ats = boost::apply_visitor(tsv); + // print map of outputs + out << "{"; + for(auto it = ctv.second.cbegin(); it != ctv.second.cend();) { + if (it->first.timestamp != boost::gregorian::date()) + out << "(" << table_fmt_vertex_to_str(it->first.value) + << "," << it->first.timestamp << "):" << it->second; + else + out << table_fmt_vertex_to_str(it->first.value) + << ":" << it->second; + if (++it != ctv.second.cend()) + out << ","; + } + out << "},"; + // print inputs + return ostreamln_container(out, ats(ctv.first.get_variant()), ","); +} + +ostream& ostreamCompressedTable(ostream& out, const CompressedTable& ct) +{ + // print header + ostreamCompressedTableHeader(out, ct); + // print data + for (const auto& v : ct) + ostreamCompressedTableRow(out, v); + + return out; +} + +ostream& ostreamCompressedTableTimeHeader(ostream& out, const CompressedTableTime& ctt) +{ + out << "timestamp,output" << endl; + return out; +} + +ostream& ostreamCompressedTableTimeRow(ostream& out, const CompressedTableTime::value_type& tio) +{ + out << tio.first << ",{"; + for (auto it = tio.second.cbegin(); it != tio.second.cend();) { + out << table_fmt_vertex_to_str(it->first) + << ":" << it->second; + if(++it != tio.second.cend()) + out << ","; + } + out << "}" << endl; + return out; +} + +ostream& ostreamCompressedTableTime(ostream& out, const CompressedTableTime& ctt) +{ + // print header + ostreamCompressedTableTimeHeader(out, ctt); + + // print data by time + for (const auto& tio : ctt) + ostreamCompressedTableTimeRow(out, tio); + + return out; +} + +// =========================================================== +// operator<< for the various tables and stuff. 
+ +ostream& operator<<(ostream& out, const ITable& it) +{ + ostreamln_container(out, it.get_labels(), ","); + ostreamln_container(out, it.get_types(), ","); + to_strings_visitor tsv; + for (const auto& row : it) { + vector row_str = boost::apply_visitor(tsv, row.get_variant()); + ostreamln_container(out, row_str, ","); + } + return out; +} + +ostream& operator<<(ostream& out, const OTable& ot) +{ + if (!ot.get_label().empty()) + out << ot.get_label() << endl; + out << ot.get_type() << endl; + for (const vertex& v : ot) + out << table_fmt_vertex_to_str(v) << endl; + return out; +} + +ostream& operator<<(ostream& out, const Table& table) +{ + return ostreamTable(out, table); +} + +ostream& operator<<(ostream& out, const complete_truth_table& tt) +{ + return ostream_container(out, tt); +} + +ostream& operator<<(ostream& out, const CompressedTable& ct) +{ + return ostreamCompressedTable(out, ct); +} + +} // ~namespaces combo + +std::string oc_to_string(const combo::ITable& it, const std::string& indent) +{ + std::stringstream ss; + ss << it; + return ss.str(); +} + +std::string oc_to_string(const combo::OTable& ot, const std::string& indent) +{ + std::stringstream ss; + ss << ot; + return ss.str(); +} + +std::string oc_to_string(const combo::Table& table, const std::string& indent) +{ + std::stringstream ss; + ss << table; + return ss.str(); +} + +std::string oc_to_string(const combo::CompressedTable& ct, const std::string& indent) +{ + std::stringstream ss; + ss << ct; + return ss.str(); +} + +std::string oc_to_string(const combo::complete_truth_table& tt, + const std::string& indent) +{ + std::stringstream ss; + ss << tt; + return ss.str(); +} + +} // ~namespaces opencog diff --git a/opencog/persist/csv/table_io.h b/opencog/persist/csv/table_io.h new file mode 100644 index 0000000000..43d95635cb --- /dev/null +++ b/opencog/persist/csv/table_io.h @@ -0,0 +1,265 @@ +/** + * table_io.h --- + * + * Copyright (C) 2010 OpenCog Foundation + * Copyright (C) 2012 Poulin 
Holdings LLC + * + * Authors: Nil Geisweiller + * Linas Vepstas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + + +#ifndef _OPENCOG_TABLE_IO_H +#define _OPENCOG_TABLE_IO_H + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "table.h" +#include "opencog/asmoses/combo/type_checker/type_tree.h" + +namespace opencog { namespace combo { + +/** + * remove the carriage return (for DOS format) + */ +void removeCarriageReturn(std::string& str); + +/** + * remove non ASCII char at the begining of the string + */ +void removeNonASCII(std::string& str); + +/** + * Return true if the next chars in 'in' correspond to carriage return + * (support UNIX and DOS format) and advance in of the checked chars. + */ +bool checkCarriageReturn(std::istream& in); + +/** + * Convert strings to typed values + */ +builtin token_to_boolean(const std::string& token); +contin_t token_to_contin(const std::string& token); +vertex token_to_vertex(const type_node &tipe, const std::string& token); + + +// =========================================================== + +typedef boost::tokenizer> table_tokenizer; + +/** + * Take a row, return a tokenizer. Tokenization uses the + * separator characters comma, blank, tab (',', ' ' or '\t'). 
+ */ +table_tokenizer get_row_tokenizer(const std::string& line); + +/** + * Take a line and return a vector containing the elements parsed. + * Used by istreamTable. + */ +template +static std::vector tokenizeRow( + const std::string& line, + const std::vector& ignored_indices=std::vector()) +{ + table_tokenizer tok = get_row_tokenizer(line); + std::vector res; + unsigned i = 0; + for (const std::string& t : tok) { + + // trim away whitespace padding; failing to do this + // confuses stuff downstream. + std::string clean(t); + boost::trim(clean); + + // Sometimes the tokenizer returns pure whitespace :-( + if (0 == clean.size()) continue; + + if (!boost::binary_search(ignored_indices, i++)) + res.push_back(boost::lexical_cast(clean)); + } + return res; +} + +// =========================================================== + +////////////////// +// istreamTable // +////////////////// + +// some hacky function to get the header of a DSV file (assuming there is one) +string_seq get_header(const std::string& input_file); + +std::istream& istreamRawITable( + std::istream& in, ITable& tab, + const std::vector& ignored_indices=std::vector()); + +std::istream& istreamITable(std::istream& in, ITable& tab, + const string_seq& ignore_features); + +std::istream& istreamTable(std::istream& in, Table& tab, + const std::string& target_feature, + const std::string& timestamp_feature, + const string_seq& ignore_features); + +// WARNING: this implementation only supports boolean ctable!!!! +std::istream& istreamCompressedTable(std::istream& in, CompressedTable& ctable); + +/** + * Load a OTable given the file name. Only works for dense DSV data. 
+ */ +OTable loadOTable(const std::string& file_name, + const std::string& target_feature); + +// TODO: reimplement loadITable with the same model of loadTable and +// remove loadITable_optimized +ITable loadITable( + const std::string& file_name, + const string_seq& ignore_features=string_seq()); + +ITable loadITable_optimized( + const std::string& file_name, + const string_seq& ignore_features=string_seq()); + +/** + * If target_feature is empty then, in case there is no header, it is + * assumed to be the first feature. + */ +Table loadTable( + const std::string& file_name, + const std::string& target_feature=std::string(), + const std::string& timestamp_feature=std::string(), + const string_seq& ignore_features=string_seq()); + +std::istream& istreamDenseTable(std::istream& in, Table& tab, + const std::string& target_feature, + const std::string& timestamp_feature, + const string_seq& ignore_features, + const type_tree& tt, bool has_header); + +// WARNING: this implementation only supports boolean ctable!!!! +CompressedTable loadCompressedTable(const std::string& file_name); + +////////////////// +// ostreamTable // +////////////////// + +/// output the header of a data table in CSV format. +template +Out& ostreamTableHeader(Out& out, const Table& table) +{ + // Add input features in header + string_seq header = table.itable.get_labels(); + unsigned hsize = header.size(); + + // Add target feature in header + const std::string& ol = table.otable.get_label(); + header.insert(header.begin() + std::min(table.target_pos, hsize), ol); + + // Add timestamp feature in header + if (!table.ttable.empty()) { + const std::string& tl = table.ttable.get_label(); + header.insert(header.begin() + table.timestamp_pos, tl); + } + + // Write the header + ostream_container(out, header, ",") << std::endl; + return out; +} + +/// Output a data table in CSV format. Boolean values are output in +/// binary form (0 for false, 1 for true). 
+template +Out& ostreamTable(Out& out, const Table& table) +{ + // print header + ostreamTableHeader(out, table); + + // print data + unsigned isize = table.itable.size(), osize = table.otable.size(); + OC_ASSERT(table.itable.empty() || isize == osize); + for (size_t row = 0; row < osize; ++row) { + // Add input values + string_seq content; + if (!table.itable.empty()) + content = table.itable[row].to_strings(); + unsigned csize = content.size(); + + // Add target feature value + std::string oc = table_fmt_vertex_to_str(table.otable[row]); + content.insert(content.begin() + std::min(table.target_pos, csize), oc); + + // Add timestamp feature value + if (!table.ttable.empty()) { + std::string tc = TTable::to_string(table.ttable[row]); + content.insert(content.begin() + table.timestamp_pos, tc); + } + + // Write content row + ostream_container(out, content, ",") << std::endl; + } + return out; +} + +/// like above but take a table instead of a input and output table +void saveTable(const std::string& file_name, const Table& table); + +/// output a compressed table in pseudo CSV format +std::ostream& ostreamCompressedTableRow(std::ostream& out, const CompressedTable::value_type& ctv); +std::ostream& ostreamCompressedTable(std::ostream& out, const CompressedTable& ct); + +/// Output a compressed table with each row corresponding to a +/// timestamp, chronologically ordered. 
+std::ostream& ostreamCompressedTableTime(std::ostream& out, const CompressedTableTime& ctt); + +std::ostream& operator<<(std::ostream& out, const ITable& it); + +std::ostream& operator<<(std::ostream& out, const OTable& ot); + +std::ostream& operator<<(std::ostream& out, const Table& table); + +std::ostream& operator<<(std::ostream& out, const CompressedTable& ct); + +std::ostream& operator<<(std::ostream& out, const complete_truth_table& tt); + +} // ~namespaces combo + +// For pretty printing OpenCog objects while debugging, see +// https://wiki.opencog.org/w/Development_standards#Pretty_Print_OpenCog_Objects +std::string oc_to_string(const combo::ITable& it, + const std::string& indent=empty_string); +std::string oc_to_string(const combo::OTable& ot, + const std::string& indent=empty_string); +std::string oc_to_string(const combo::Table& table, + const std::string& indent=empty_string); +std::string oc_to_string(const combo::CompressedTable& ct, + const std::string& indent=empty_string); +std::string oc_to_string(const combo::complete_truth_table& tt, + const std::string& indent=empty_string); + +} // ~namespaces opencog + +#endif // _OPENCOG_TABLE_IO_H From 0c75df43b9d3bc7c29af10e9dffae327f557c5b0 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Tue, 2 Aug 2022 11:23:37 +0200 Subject: [PATCH 04/56] Cut down the original code to only the readers --- opencog/persist/csv/table_io.h | 265 ---------------- .../csv/{table_io.cc => table_read.cc} | 295 ++---------------- opencog/persist/csv/table_read.h | 143 +++++++++ 3 files changed, 167 insertions(+), 536 deletions(-) delete mode 100644 opencog/persist/csv/table_io.h rename opencog/persist/csv/{table_io.cc => table_read.cc} (84%) create mode 100644 opencog/persist/csv/table_read.h diff --git a/opencog/persist/csv/table_io.h b/opencog/persist/csv/table_io.h deleted file mode 100644 index 43d95635cb..0000000000 --- a/opencog/persist/csv/table_io.h +++ /dev/null @@ -1,265 +0,0 @@ -/** - * table_io.h --- - * - * 
Copyright (C) 2010 OpenCog Foundation - * Copyright (C) 2012 Poulin Holdings LLC - * - * Authors: Nil Geisweiller - * Linas Vepstas - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License v3 as - * published by the Free Software Foundation and including the exceptions - * at http://opencog.org/wiki/Licenses - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program; if not, write to: - * Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - - -#ifndef _OPENCOG_TABLE_IO_H -#define _OPENCOG_TABLE_IO_H - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "table.h" -#include "opencog/asmoses/combo/type_checker/type_tree.h" - -namespace opencog { namespace combo { - -/** - * remove the carriage return (for DOS format) - */ -void removeCarriageReturn(std::string& str); - -/** - * remove non ASCII char at the begining of the string - */ -void removeNonASCII(std::string& str); - -/** - * Return true if the next chars in 'in' correspond to carriage return - * (support UNIX and DOS format) and advance in of the checked chars. - */ -bool checkCarriageReturn(std::istream& in); - -/** - * Convert strings to typed values - */ -builtin token_to_boolean(const std::string& token); -contin_t token_to_contin(const std::string& token); -vertex token_to_vertex(const type_node &tipe, const std::string& token); - - -// =========================================================== - -typedef boost::tokenizer> table_tokenizer; - -/** - * Take a row, return a tokenizer. 
Tokenization uses the - * separator characters comma, blank, tab (',', ' ' or '\t'). - */ -table_tokenizer get_row_tokenizer(const std::string& line); - -/** - * Take a line and return a vector containing the elements parsed. - * Used by istreamTable. - */ -template -static std::vector tokenizeRow( - const std::string& line, - const std::vector& ignored_indices=std::vector()) -{ - table_tokenizer tok = get_row_tokenizer(line); - std::vector res; - unsigned i = 0; - for (const std::string& t : tok) { - - // trim away whitespace padding; failing to do this - // confuses stuff downstream. - std::string clean(t); - boost::trim(clean); - - // Sometimes the tokenizer returns pure whitespace :-( - if (0 == clean.size()) continue; - - if (!boost::binary_search(ignored_indices, i++)) - res.push_back(boost::lexical_cast(clean)); - } - return res; -} - -// =========================================================== - -////////////////// -// istreamTable // -////////////////// - -// some hacky function to get the header of a DSV file (assuming there is one) -string_seq get_header(const std::string& input_file); - -std::istream& istreamRawITable( - std::istream& in, ITable& tab, - const std::vector& ignored_indices=std::vector()); - -std::istream& istreamITable(std::istream& in, ITable& tab, - const string_seq& ignore_features); - -std::istream& istreamTable(std::istream& in, Table& tab, - const std::string& target_feature, - const std::string& timestamp_feature, - const string_seq& ignore_features); - -// WARNING: this implementation only supports boolean ctable!!!! -std::istream& istreamCompressedTable(std::istream& in, CompressedTable& ctable); - -/** - * Load a OTable given the file name. Only works for dense DSV data. 
- */ -OTable loadOTable(const std::string& file_name, - const std::string& target_feature); - -// TODO: reimplement loadITable with the same model of loadTable and -// remove loadITable_optimized -ITable loadITable( - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - -ITable loadITable_optimized( - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - -/** - * If target_feature is empty then, in case there is no header, it is - * assumed to be the first feature. - */ -Table loadTable( - const std::string& file_name, - const std::string& target_feature=std::string(), - const std::string& timestamp_feature=std::string(), - const string_seq& ignore_features=string_seq()); - -std::istream& istreamDenseTable(std::istream& in, Table& tab, - const std::string& target_feature, - const std::string& timestamp_feature, - const string_seq& ignore_features, - const type_tree& tt, bool has_header); - -// WARNING: this implementation only supports boolean ctable!!!! -CompressedTable loadCompressedTable(const std::string& file_name); - -////////////////// -// ostreamTable // -////////////////// - -/// output the header of a data table in CSV format. -template -Out& ostreamTableHeader(Out& out, const Table& table) -{ - // Add input features in header - string_seq header = table.itable.get_labels(); - unsigned hsize = header.size(); - - // Add target feature in header - const std::string& ol = table.otable.get_label(); - header.insert(header.begin() + std::min(table.target_pos, hsize), ol); - - // Add timestamp feature in header - if (!table.ttable.empty()) { - const std::string& tl = table.ttable.get_label(); - header.insert(header.begin() + table.timestamp_pos, tl); - } - - // Write the header - ostream_container(out, header, ",") << std::endl; - return out; -} - -/// Output a data table in CSV format. Boolean values are output in -/// binary form (0 for false, 1 for true). 
-template -Out& ostreamTable(Out& out, const Table& table) -{ - // print header - ostreamTableHeader(out, table); - - // print data - unsigned isize = table.itable.size(), osize = table.otable.size(); - OC_ASSERT(table.itable.empty() || isize == osize); - for (size_t row = 0; row < osize; ++row) { - // Add input values - string_seq content; - if (!table.itable.empty()) - content = table.itable[row].to_strings(); - unsigned csize = content.size(); - - // Add target feature value - std::string oc = table_fmt_vertex_to_str(table.otable[row]); - content.insert(content.begin() + std::min(table.target_pos, csize), oc); - - // Add timestamp feature value - if (!table.ttable.empty()) { - std::string tc = TTable::to_string(table.ttable[row]); - content.insert(content.begin() + table.timestamp_pos, tc); - } - - // Write content row - ostream_container(out, content, ",") << std::endl; - } - return out; -} - -/// like above but take a table instead of a input and output table -void saveTable(const std::string& file_name, const Table& table); - -/// output a compressed table in pseudo CSV format -std::ostream& ostreamCompressedTableRow(std::ostream& out, const CompressedTable::value_type& ctv); -std::ostream& ostreamCompressedTable(std::ostream& out, const CompressedTable& ct); - -/// Output a compressed table with each row corresponding to a -/// timestamp, chronologically ordered. 
-std::ostream& ostreamCompressedTableTime(std::ostream& out, const CompressedTableTime& ctt); - -std::ostream& operator<<(std::ostream& out, const ITable& it); - -std::ostream& operator<<(std::ostream& out, const OTable& ot); - -std::ostream& operator<<(std::ostream& out, const Table& table); - -std::ostream& operator<<(std::ostream& out, const CompressedTable& ct); - -std::ostream& operator<<(std::ostream& out, const complete_truth_table& tt); - -} // ~namespaces combo - -// For pretty printing OpenCog objects while debugging, see -// https://wiki.opencog.org/w/Development_standards#Pretty_Print_OpenCog_Objects -std::string oc_to_string(const combo::ITable& it, - const std::string& indent=empty_string); -std::string oc_to_string(const combo::OTable& ot, - const std::string& indent=empty_string); -std::string oc_to_string(const combo::Table& table, - const std::string& indent=empty_string); -std::string oc_to_string(const combo::CompressedTable& ct, - const std::string& indent=empty_string); -std::string oc_to_string(const combo::complete_truth_table& tt, - const std::string& indent=empty_string); - -} // ~namespaces opencog - -#endif // _OPENCOG_TABLE_IO_H diff --git a/opencog/persist/csv/table_io.cc b/opencog/persist/csv/table_read.cc similarity index 84% rename from opencog/persist/csv/table_io.cc rename to opencog/persist/csv/table_read.cc index 1f80c8fdc6..2d2181449e 100644 --- a/opencog/persist/csv/table_io.cc +++ b/opencog/persist/csv/table_read.cc @@ -1,7 +1,8 @@ -/** table_io.cc --- +/** table_read.cc -- * * Copyright (C) 2010 OpenCog Foundation * Copyright (C) 2012 Poulin Holdings LLC + * Copyright (C) 2022 Linas Vepstas * * Authors: Nil Geisweiller * Linas Vepstas @@ -41,10 +42,9 @@ #include #include -#include "table.h" -#include "table_io.h" +#include "table_read.h" -namespace opencog { namespace combo { +namespace opencog { using namespace std; using namespace boost; @@ -125,7 +125,7 @@ static const char *sparse_delim = " : "; /** * parse a pair of 
key/value in a parse dataset, using ':' as * delimiter. For instance - * + * * parse_key_val("key : val") * * returns @@ -148,7 +148,7 @@ parse_key_val(string chunk) boost::trim(val); return {key, val}; } - + /** * Take a row, return a tokenizer. Tokenization uses the * separator characters comma, blank, tab (',', ' ' or '\t'). @@ -231,7 +231,7 @@ type_node infer_type_from_token(const string& token) * Compare this to 'curr_guess', and upgrade the type inference * if it can be done consistently. */ -static type_node +static type_node infer_type_from_token2(type_node curr_guess, const string& token) { type_node tokt = infer_type_from_token(token); @@ -508,21 +508,21 @@ struct from_sparse_tokens_visitor : public from_tokens_visitor * The sparse table format consists of some fixed number of columns, * in comma-separated format, followed by key-value pairs, also * tab-separated. viz: - * + * * val, val, val, name:val, name:val, name:val - * - * Thus, for example, a row such as - * + * + * Thus, for example, a row such as + * * earn, issued : 1, results : 2, ending : 1, including : 1 - * + * * indicates that there one fixed column, of enum type, (the enum value - * being "earn"), and that features called "issued", "ending" and + * being "earn"), and that features called "issued", "ending" and * "including" have a contin value of 1.0 and "results" has a contin * value of 2. - * + * * The routine does NOT store the table in sparse format: it stores the * full, exploded table. This could be bad ... - * TODO: we really need a sparse table format, as well. + * TODO: we really need a sparse table format, as well. * * The "Raw" format has all data as strings; type conversion to the * appropriate type, must all be done as a separate step. @@ -556,7 +556,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) if (0 == fixed_arity) { vector fixy = tokenizeSparseRow(lines[0]); // count commas, until a semi-colon is found. 
- while (string::npos == fixy[fixed_arity].find(sparse_delim)) + while (string::npos == fixy[fixed_arity].find(sparse_delim)) fixed_arity++; } logger().info() << "Sparse file fixed column count=" << fixed_arity; @@ -762,20 +762,8 @@ istream& istreamITable_ignore_indices(istream& in, ITable& tab, [&](multi_type_seq& seq) { return aft(seq.get_variant()); }); - - return in; -} - -OTable loadOTable(const string& file_name, const string& target_feature) -{ - vector ignore_features; - for (const string& l : get_header(file_name)) - if (l != target_feature) - ignore_features.push_back(l); - ITable itab = loadITable(file_name, ignore_features); - OTable res(itab.get_column_data(target_feature), target_feature); - return res; + return in; } /** @@ -834,7 +822,7 @@ ITable loadITable_optimized(const string& file_name, // determined ignore_indices vector ignore_indices = get_indices(ignore_features, get_header(file_name)); - + ITable res; istreamITable_ignore_indices(in, res, ignore_indices); return res; @@ -860,12 +848,12 @@ istream& istreamTable_OLD(istream& in, Table& tab, istreamITable(in, tab.itable, ignore_features); tab.otable = tab.itable.get_column_data(target_feature); - OC_ASSERT(0 != tab.otable.size(), + OC_ASSERT(0 != tab.otable.size(), "Fatal Error: target feature \"%s\" not found", target_feature.c_str()); tab.target_pos = tab.itable.get_column_offset(target_feature); - + type_node targ_type = tab.itable.get_type(target_feature); string targ_feat = tab.itable.delete_column(target_feature); @@ -885,16 +873,16 @@ istream& istreamTable_OLD(istream& in, Table& tab, istream& istreamTable_ignore_indices(istream& in, Table& tab, const string& target_feature, const vector& ignore_indices) -{ +{ istreamITable_ignore_indices(in, tab.itable, ignore_indices); tab.otable = tab.itable.get_column_data(target_feature); - OC_ASSERT(0 != tab.otable.size(), + OC_ASSERT(0 != tab.otable.size(), "Fatal Error: target feature \"%s\" not found", target_feature.c_str()); tab.target_pos = 
tab.itable.get_column_offset(target_feature); - + type_node targ_type = tab.itable.get_type(target_feature); string targ_feat = tab.itable.delete_column(target_feature); @@ -1067,7 +1055,7 @@ istream& istreamTable(istream& in, Table& tab, */ template std::pair, T> -tokenizeRowIO( +tokenizeRowIO ( const std::string& line, const std::vector& ignored_indices=std::vector(), unsigned target_idx=0) @@ -1227,72 +1215,6 @@ istream& istreamDenseTable(istream& in, Table& tab, // ================================================================== -// Parse a CompressedTable row -// TODO: implement timestamp support -CompressedTable::value_type parseCompressedTableRow(const type_tree& tt, const std::string& row_str) -{ - // split the string between input and output - unsigned end_outputs_pos = row_str.find("}"); - string outputs = row_str.substr(1, end_outputs_pos - 1), - inputs = row_str.substr(end_outputs_pos + 2); // +2 to go - // passed the - // following , - - // convert the inputs string into multi_type_seq - type_node_seq tns = vector_comp(get_signature_inputs(tt), get_type_node); - vector input_seq = tokenizeRow(inputs); - from_tokens_visitor ftv(tns); - multi_type_seq input_values = ftv(input_seq); - - // convert the outputs string into CompressedTable::counter_t - vector output_pair_seq = tokenizeRow(outputs); - CompressedTable::counter_t counter; - for (const string& pair_str : output_pair_seq) { - unsigned sep_pos = pair_str.find(":"); - string key_str = pair_str.substr(0, sep_pos), - value_str = pair_str.substr(sep_pos + 1); - vertex v = token_to_vertex(get_type_node(get_signature_output(tt)), - key_str); - count_t count = atof(value_str.c_str()); - counter[TimedValue(v)] = count; - } - return CompressedTable::value_type(input_values, counter); -} - -// WARNING: this implementation only supports boolean ctable!!!! 
-std::istream& istreamCompressedTable(std::istream& in, CompressedTable& ctable) -{ - //////////////// - // set header // - //////////////// - string header_line; - get_data_line(in, header_line); - auto labels = tokenizeRow(header_line); - ctable.set_labels(labels); - - //////////////////////// - // set type signature // - //////////////////////// - // HACK THIS PART TO MAKE IT SUPPORT OTHER TYPES THAN BOOLEAN - ctable.set_signature(gen_signature(id::boolean_type, ctable.get_arity())); - - ///////////////// - // set content // - ///////////////// - std::vector lines; - // read the entire file - { - string line; - while (get_data_line(in, line)) - lines.push_back(line); - } - // parse each line and fill the ctable - for (const string& line : lines) - ctable.insert(parseCompressedTableRow(ctable.get_signature(), line)); - - return in; -} - Table loadTable(const std::string& file_name, const std::string& target_feature, const std::string& timestamp_feature, @@ -1307,173 +1229,4 @@ Table loadTable(const std::string& file_name, return res; } -CompressedTable loadCompressedTable(const string& file_name) -{ - CompressedTable ctable; - OC_ASSERT(!file_name.empty(), "No filename specified!"); - ifstream in(file_name.c_str()); - istreamCompressedTable(in, ctable); - return ctable; -} - -// =========================================================== -// ostream regular tables - -void saveTable(const string& file_name, const Table& table) -{ - OC_ASSERT(!file_name.empty(), "No filename specified!"); - ofstream out(file_name.c_str()); - OC_ASSERT(out.is_open(), "Could not open %s", file_name.c_str()); - ostreamTable(out, table); -} - -// =========================================================== -// ostream CompressedTables - -ostream& ostreamCompressedTableHeader(ostream& out, const CompressedTable& ct) -{ - return ostreamln_container(out, ct.get_labels(), ","); -} - -ostream& ostreamCompressedTableRow(ostream& out, const CompressedTable::value_type& ctv) -{ - 
to_strings_visitor tsv; - auto ats = boost::apply_visitor(tsv); - // print map of outputs - out << "{"; - for(auto it = ctv.second.cbegin(); it != ctv.second.cend();) { - if (it->first.timestamp != boost::gregorian::date()) - out << "(" << table_fmt_vertex_to_str(it->first.value) - << "," << it->first.timestamp << "):" << it->second; - else - out << table_fmt_vertex_to_str(it->first.value) - << ":" << it->second; - if (++it != ctv.second.cend()) - out << ","; - } - out << "},"; - // print inputs - return ostreamln_container(out, ats(ctv.first.get_variant()), ","); -} - -ostream& ostreamCompressedTable(ostream& out, const CompressedTable& ct) -{ - // print header - ostreamCompressedTableHeader(out, ct); - // print data - for (const auto& v : ct) - ostreamCompressedTableRow(out, v); - - return out; -} - -ostream& ostreamCompressedTableTimeHeader(ostream& out, const CompressedTableTime& ctt) -{ - out << "timestamp,output" << endl; - return out; -} - -ostream& ostreamCompressedTableTimeRow(ostream& out, const CompressedTableTime::value_type& tio) -{ - out << tio.first << ",{"; - for (auto it = tio.second.cbegin(); it != tio.second.cend();) { - out << table_fmt_vertex_to_str(it->first) - << ":" << it->second; - if(++it != tio.second.cend()) - out << ","; - } - out << "}" << endl; - return out; -} - -ostream& ostreamCompressedTableTime(ostream& out, const CompressedTableTime& ctt) -{ - // print header - ostreamCompressedTableTimeHeader(out, ctt); - - // print data by time - for (const auto& tio : ctt) - ostreamCompressedTableTimeRow(out, tio); - - return out; -} - -// =========================================================== -// operator<< for the various tables and stuff. 
- -ostream& operator<<(ostream& out, const ITable& it) -{ - ostreamln_container(out, it.get_labels(), ","); - ostreamln_container(out, it.get_types(), ","); - to_strings_visitor tsv; - for (const auto& row : it) { - vector row_str = boost::apply_visitor(tsv, row.get_variant()); - ostreamln_container(out, row_str, ","); - } - return out; -} - -ostream& operator<<(ostream& out, const OTable& ot) -{ - if (!ot.get_label().empty()) - out << ot.get_label() << endl; - out << ot.get_type() << endl; - for (const vertex& v : ot) - out << table_fmt_vertex_to_str(v) << endl; - return out; -} - -ostream& operator<<(ostream& out, const Table& table) -{ - return ostreamTable(out, table); -} - -ostream& operator<<(ostream& out, const complete_truth_table& tt) -{ - return ostream_container(out, tt); -} - -ostream& operator<<(ostream& out, const CompressedTable& ct) -{ - return ostreamCompressedTable(out, ct); -} - -} // ~namespaces combo - -std::string oc_to_string(const combo::ITable& it, const std::string& indent) -{ - std::stringstream ss; - ss << it; - return ss.str(); -} - -std::string oc_to_string(const combo::OTable& ot, const std::string& indent) -{ - std::stringstream ss; - ss << ot; - return ss.str(); -} - -std::string oc_to_string(const combo::Table& table, const std::string& indent) -{ - std::stringstream ss; - ss << table; - return ss.str(); -} - -std::string oc_to_string(const combo::CompressedTable& ct, const std::string& indent) -{ - std::stringstream ss; - ss << ct; - return ss.str(); -} - -std::string oc_to_string(const combo::complete_truth_table& tt, - const std::string& indent) -{ - std::stringstream ss; - ss << tt; - return ss.str(); -} - } // ~namespaces opencog diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h new file mode 100644 index 0000000000..b5ac544a50 --- /dev/null +++ b/opencog/persist/csv/table_read.h @@ -0,0 +1,143 @@ +/** + * table_read.h -- Read a CSV/TSV table + * + * Copyright (C) 2010 OpenCog Foundation + * 
Copyright (C) 2012 Poulin Holdings LLC + * Copyright (C) 2022 Linas Vepstas + * + * Authors: Nil Geisweiller + * Linas Vepstas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _ATOMESE_TABLE_READ_H +#define _ATOMESE_TABLE_READ_H + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace opencog { + +/** + * remove the carriage return (for DOS format) + */ +void removeCarriageReturn(std::string& str); + +/** + * remove non ASCII char at the begining of the string + */ +void removeNonASCII(std::string& str); + +/** + * Return true if the next chars in 'in' correspond to carriage return + * (support UNIX and DOS format) and advance in of the checked chars. + */ +bool checkCarriageReturn(std::istream& in); + +/** + * Convert strings to typed values + */ +builtin token_to_boolean(const std::string& token); +contin_t token_to_contin(const std::string& token); +vertex token_to_vertex(const type_node &tipe, const std::string& token); + + +// =========================================================== + +typedef boost::tokenizer> table_tokenizer; + +/** + * Take a row, return a tokenizer. Tokenization uses the + * separator characters comma, blank, tab (',', ' ' or '\t'). 
+ */ +table_tokenizer get_row_tokenizer(const std::string& line); + +/** + * Take a line and return a vector containing the elements parsed. + */ +template +static std::vector tokenizeRow ( + const std::string& line, + const std::vector& ignored_indices=std::vector()) +{ + table_tokenizer tok = get_row_tokenizer(line); + std::vector res; + unsigned i = 0; + for (const std::string& t : tok) { + + // trim away whitespace padding; failing to do this + // confuses stuff downstream. + std::string clean(t); + boost::trim(clean); + + // Sometimes the tokenizer returns pure whitespace :-( + if (0 == clean.size()) continue; + + if (!boost::binary_search(ignored_indices, i++)) + res.push_back(boost::lexical_cast(clean)); + } + return res; +} + +// =========================================================== + +// Get the header of a DSV file (assuming there is one) +string_seq get_header(const std::string& input_file); + +std::istream& istreamRawITable( + std::istream& in, ITable& tab, + const std::vector& ignored_indices=std::vector()); + +std::istream& istreamITable(std::istream& in, ITable& tab, + const string_seq& ignore_features); + +std::istream& istreamTable(std::istream& in, Table& tab, + const string_seq& ignore_features); + +// TODO: reimplement loadITable with the same model of loadTable and +// remove loadITable_optimized +ITable loadITable( + const std::string& file_name, + const string_seq& ignore_features=string_seq()); + +ITable loadITable_optimized( + const std::string& file_name, + const string_seq& ignore_features=string_seq()); + +/** + * If target_feature is empty then, in case there is no header, it is + * assumed to be the first feature. 
+ */ +Table loadTable( + const std::string& file_name, + const string_seq& ignore_features=string_seq()); + +std::istream& istreamDenseTable(std::istream& in, Table& tab, + const string_seq& ignore_features, + const type_tree& tt, bool has_header); + + +} // ~namespaces opencog + +#endif // _ATOMESE_TABLE_READ_H From 345de2bb2ac58ffc735ebd2230bf2a6603c144b1 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 12:25:28 +0300 Subject: [PATCH 05/56] Add Makefile. --- opencog/persist/CMakeLists.txt | 1 + opencog/persist/csv/CMakeLists.txt | 25 +++++++++++++++++++++++++ opencog/persist/csv/load_csv.cc | 4 ++-- 3 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 opencog/persist/csv/CMakeLists.txt diff --git a/opencog/persist/CMakeLists.txt b/opencog/persist/CMakeLists.txt index 9fd966d7bb..ea24c420ad 100644 --- a/opencog/persist/CMakeLists.txt +++ b/opencog/persist/CMakeLists.txt @@ -1,5 +1,6 @@ ADD_SUBDIRECTORY (storage) ADD_SUBDIRECTORY (api) +ADD_SUBDIRECTORY (csv) IF (HAVE_GEARMAN AND HAVE_GUILE) ADD_SUBDIRECTORY (gearman) diff --git a/opencog/persist/csv/CMakeLists.txt b/opencog/persist/csv/CMakeLists.txt new file mode 100644 index 0000000000..d358336b58 --- /dev/null +++ b/opencog/persist/csv/CMakeLists.txt @@ -0,0 +1,25 @@ + +# Generic JSON decoding. 
+ADD_LIBRARY (csv + load_csv.cc + table_read.cc +) + +ADD_DEPENDENCIES(csv opencog_atom_types) + +TARGET_LINK_LIBRARIES(csv + atomspace + atombase + ${COGUTIL_LIBRARY} +) + +INSTALL (TARGETS csv EXPORT AtomSpaceTargets + DESTINATION "lib${LIB_DIR_SUFFIX}/opencog" +) + +INSTALL (FILES + load_csv.h + DESTINATION "include/opencog/persist/csv" +) + +# ------------------------------- diff --git a/opencog/persist/csv/load_csv.cc b/opencog/persist/csv/load_csv.cc index f69885c879..56405b3348 100644 --- a/opencog/persist/csv/load_csv.cc +++ b/opencog/persist/csv/load_csv.cc @@ -1,4 +1,4 @@ -/** +/** * load_csv.cc -- Load CSV tables into Values * * Copyright (C) 2022 Linas Vepstas @@ -25,7 +25,7 @@ using namespace opencog; * Load columns from a CSV file and place them into Atomese Values on * the indicated Atom. Atomese Values are vectors (of floats, bools, * srings, or more complex structures). Each Value holds one column - * from the dataset. + * from the dataset. * * The features (columns) specified in ignore_features will be omitted * from the representation. From ca52b37e4e2453af9722aa8a6b5878728dc5f768 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 12:36:28 +0300 Subject: [PATCH 06/56] Include AtomSpace --- opencog/persist/csv/load_csv.cc | 5 +++++ opencog/persist/csv/load_csv.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/opencog/persist/csv/load_csv.cc b/opencog/persist/csv/load_csv.cc index 56405b3348..6c55f5f000 100644 --- a/opencog/persist/csv/load_csv.cc +++ b/opencog/persist/csv/load_csv.cc @@ -19,6 +19,11 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ +#include + +#include +#include "load_csv.h" + using namespace opencog; /** diff --git a/opencog/persist/csv/load_csv.h b/opencog/persist/csv/load_csv.h index a28a0b56ae..f073d6336b 100644 --- a/opencog/persist/csv/load_csv.h +++ b/opencog/persist/csv/load_csv.h @@ -25,6 +25,8 @@ #ifndef _ATOMESE_LOAD_CSV_H #define _ATOMESE_LOAD_CSV_H +#include + namespace opencog { // Load columns from a CSV file and place them into Atomese Values on From 7a3e3cf3937ed7c2573d7bf2cc9a3a1b792f3cbd Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 12:59:06 +0300 Subject: [PATCH 07/56] Convert bool and contin types to Values --- opencog/persist/csv/table_read.cc | 96 ++++++++++++++----------------- opencog/persist/csv/table_read.h | 8 ++- 2 files changed, 48 insertions(+), 56 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 2d2181449e..1006e20b60 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -42,6 +42,10 @@ #include #include +#include +#include +#include + #include "table_read.h" namespace opencog { @@ -259,64 +263,50 @@ infer_type_from_token2(type_node curr_guess, const string& token) } /// cast string "token" to a vertex of type "tipe" -builtin token_to_boolean(const string& token) +ValuePtr token_to_boolean(const string& token) { - if ("0" == token || "F" == token || "f" == token) - return id::logical_false; - else if ("1" == token || "T" == token || "t" == token) - return id::logical_true; - else { - OC_ASSERT(false, "Expecting boolean value, got %s", token.c_str()); - return builtin(); - } + if ("0" == token || "F" == token || "f" == token) + return createBoolValue(false); + + if ("1" == token || "T" == token || "t" == token) + return createBoolValue(true); + + throw RuntimeError(TRACE_INFO, + "Expecting boolean value, got %s", token.c_str()); } -contin_t token_to_contin(const string& token) + +ValuePtr token_to_contin(const string& token) { - try { - return 
lexical_cast(token); - } catch(boost::bad_lexical_cast&) { - OC_ASSERT(false, "Could not cast %s to contin", token.c_str()); - return contin_t(); - } + try { + return createFloatValue(lexical_cast(token)); + } catch (boost::bad_lexical_cast&) { + throw RuntimeError(TRACE_INFO, + "Could not cast %s to floating point", token.c_str()); + } } -vertex token_to_vertex(const type_node &tipe, const string& token) -{ - switch (tipe) { - - case id::boolean_type: - return token_to_boolean(token); - - case id::contin_type: - return token_to_contin(token); - - case id::enum_type: - // Enum types must begin with an alpha character - if (isalpha(token[0])) - return enum_t(token); - OC_ASSERT(false, "Enum type must begin with alphabetic char, but %s doesn't", token.c_str()); - break; - - case id::definite_object_type: - return token; - break; - - // Ugly hack ... the problem adressed here is that feature - // selection has to read and propagate columns of unknown type - // (typically, dates, times). So we hack around this here. 
- case id::ill_formed_type: - return enum_t(token); - // return id::ill_formed_type; - // return id::null_vertex; - break; - - default: - stringstream ss; - ss << "Unable to convert token \"" << token << "\" to type=" << tipe << endl; - OC_ASSERT(0, ss.str().c_str()); - } - // unreachable - return id::null_vertex; +ValuePtr token_to_vertex(Type tipe, const std::string& token) +{ + if (BOOL_VALUE == tipe) + return token_to_boolean(token); + + if (FLOAT_VALUE == tipe) + return token_to_contin(token); + + if (STRING_VALUE == tipe) + { + // Enum types must begin with an alpha character + if (isalpha(token[0])) + return createStringValue(token); + + throw RuntimeError(TRACE_INFO, + "Enum type must begin with alphabetic char, but %s doesn't", + token.c_str()); + } + + stringstream ss; + ss << "Unable to convert token \"" << token << "\" to type=" << tipe << endl; + throw RuntimeError(TRACE_INFO, "%s", ss.str().c_str()); } // =========================================================== diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index b5ac544a50..0030997c86 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -37,6 +37,8 @@ #include #include +#include + namespace opencog { /** @@ -58,9 +60,9 @@ bool checkCarriageReturn(std::istream& in); /** * Convert strings to typed values */ -builtin token_to_boolean(const std::string& token); -contin_t token_to_contin(const std::string& token); -vertex token_to_vertex(const type_node &tipe, const std::string& token); +ValuePtr token_to_boolean(const std::string&); +ValuePtr token_to_contin(const std::string&); +ValuePtr token_to_vertex(Type, const std::string&); // =========================================================== From c1e78242dfbe503440c58b9c06811999948b72e9 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 13:05:21 +0300 Subject: [PATCH 08/56] Define what string_seq is --- opencog/persist/csv/table_read.cc | 13 ++++++------- 
opencog/persist/csv/table_read.h | 1 + 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 1006e20b60..7d9c8418a7 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -50,7 +50,6 @@ namespace opencog { -using namespace std; using namespace boost; using namespace boost::phoenix; using boost::phoenix::arg_names::arg1; @@ -105,9 +104,9 @@ bool is_comment(const char c) // // The signature of this routine is the same as std:getline() // -istream &get_data_line(istream& is, string& line) +std::istream& get_data_line(std::istream& is, std::string& line) { - while (1) + while (true) { getline(is, line); if (!is) return is; @@ -382,12 +381,12 @@ istream& istreamRawITable(istream& in, ITable& tab, return in; } -vector get_header(const string& file_name) +std::vector get_header(const std::string& file_name) { - ifstream in(file_name.c_str()); - string line; + std::ifstream in(file_name.c_str()); + std::string line; get_data_line(in, line); - return tokenizeRow(line); + return tokenizeRow(line); } // =========================================================== diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 0030997c86..46c5f72c7b 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -103,6 +103,7 @@ static std::vector tokenizeRow ( } // =========================================================== +typedef std::vector string_seq; // Get the header of a DSV file (assuming there is one) string_seq get_header(const std::string& input_file); From 1e311f7272e620887f2c8d296d771d363697c975 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 13:33:30 +0300 Subject: [PATCH 09/56] std namespace conversion for strings --- opencog/persist/csv/table_read.cc | 94 +++++++++++++++---------------- opencog/persist/csv/table_read.h | 21 +++++-- 2 files changed, 62 insertions(+), 53 
deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 7d9c8418a7..ea0d621e92 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -56,7 +56,7 @@ using boost::phoenix::arg_names::arg1; // ------------------------------------------------------- -bool checkCarriageReturn(istream& in) +bool checkCarriageReturn(std::istream& in) { char next_c = in.get(); if (next_c == '\r') // DOS format @@ -66,7 +66,7 @@ bool checkCarriageReturn(istream& in) return false; } -void removeCarriageReturn(string& str) +void removeCarriageReturn(std::string& str) { size_t s = str.size(); if ((s > 0) && (str[s-1] == '\r')) @@ -74,7 +74,7 @@ void removeCarriageReturn(string& str) } //* Remove non-ascii characters at the bigining of the line, only. -void removeNonASCII(string& str) +void removeNonASCII(std::string& str) { while (str.size() && (unsigned char)str[0] > 127) str = str.substr(1); @@ -138,16 +138,16 @@ static const char *sparse_delim = " : "; * If no such delimiter is found then it return a pair with empty key * and empty val. */ -static pair -parse_key_val(string chunk) +static std::pair +parse_key_val(const std::string& chunk) { - pair res; + std::pair res; size_t pos = chunk.find(sparse_delim); - if (string::npos == pos) + if (std::string::npos == pos) return res; - string key = chunk.substr(0, pos); + std::string key = chunk.substr(0, pos); boost::trim(key); - string val = chunk.substr(pos + strlen(sparse_delim)); + std::string val = chunk.substr(pos + strlen(sparse_delim)); boost::trim(val); return {key, val}; } @@ -167,7 +167,7 @@ table_tokenizer get_row_tokenizer(const std::string& line) } // Same as above, but only allow commas as a column separator. 
-table_tokenizer get_sparse_row_tokenizer(const string& line) +table_tokenizer get_sparse_row_tokenizer(const std::string& line) { typedef boost::escaped_list_separator separator; typedef boost::tokenizer tokenizer; @@ -182,7 +182,7 @@ table_tokenizer get_sparse_row_tokenizer(const string& line) * Used by istreamTable. This will modify the line to remove leading * non-ASCII characters, as well as stripping of any carriage-returns. */ -vector tokenizeSparseRow(const string& line) +vector tokenizeSparseRow(const std::string& line) { table_tokenizer tok = get_sparse_row_tokenizer(line); vector res; @@ -198,7 +198,7 @@ vector tokenizeSparseRow(const string& line) * Given an input string, guess the type of the string. * Inferable types are: boolean, contin and enum. */ -type_node infer_type_from_token(const string& token) +type_node infer_type_from_token(const std::string& token) { /* Prefered representation is T's and 0's, to maximize clarity, * readability. Numeric values are easily confused with contin @@ -235,7 +235,7 @@ type_node infer_type_from_token(const string& token) * if it can be done consistently. 
*/ static type_node -infer_type_from_token2(type_node curr_guess, const string& token) +infer_type_from_token2(type_node curr_guess, const std::string& token) { type_node tokt = infer_type_from_token(token); @@ -262,7 +262,7 @@ infer_type_from_token2(type_node curr_guess, const string& token) } /// cast string "token" to a vertex of type "tipe" -ValuePtr token_to_boolean(const string& token) +ValuePtr token_to_boolean(const std::string& token) { if ("0" == token || "F" == token || "f" == token) return createBoolValue(false); @@ -274,7 +274,7 @@ ValuePtr token_to_boolean(const string& token) "Expecting boolean value, got %s", token.c_str()); } -ValuePtr token_to_contin(const string& token) +ValuePtr token_to_contin(const std::string& token) { try { return createFloatValue(lexical_cast(token)); @@ -325,7 +325,7 @@ istream& istreamRawITable(istream& in, ITable& tab, streampos beg = in.tellg(); // Get the entire dataset into memory - string line; + std::string line; std::vector lines; // Read first few by hand. The first might be labels, so we must @@ -525,7 +525,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) // ... unless it isn't. (The header must not contain a colon). vector labs; size_t fixed_arity = 0; - string header; + std::string header; get_data_line(in, header); if (string::npos == header.find(sparse_delim)) { // Determine the arity of the fixed columns @@ -538,7 +538,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) } // Get the entire dataset into memory - string iline; + std::string iline; while (get_data_line(in, iline)) lines.push_back(iline); @@ -558,7 +558,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) // Fixed features may have different types, by column. 
type_node_seq types(fixed_arity, id::unknown_type); - for (const string& line : lines) { + for (const std::string& line : lines) { vector chunks = tokenizeSparseRow(line); vector::const_iterator pit = chunks.begin(); @@ -570,7 +570,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) for (; pit != chunks.end(); ++pit) { // Rip out the key-value pairs auto key_val = parse_key_val(*pit); - if (key_val == pair()) + if (key_val == pair()) break; // Store the key, uniquely. Store best guess as the type. feats.insert(key_val.first); @@ -584,8 +584,8 @@ istream& istreamSparseITable(istream& in, ITable& tab) // Convert the feature set into a list of labels. // 'index' is a map from feature name to column number. size_t cnt = fixed_arity; - map index; - for (const string& key : feats) { + std::map index; + for (const std::string& key : feats) { types.push_back(feat_type); labs.push_back(key); index[key] = cnt; @@ -598,7 +598,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) from_sparse_tokens_visitor fstv(types, index, fixed_arity); auto fill_line = [&](int i) { - const string& line = lines[i]; + const std::string& line = lines[i]; // Tokenize the line vector chunks = tokenizeSparseRow(line); multi_type_seq row = fstv(chunks); @@ -759,18 +759,18 @@ istream& istreamITable_ignore_indices(istream& in, ITable& tab, * Take a line and return a triple with vector containing the input * elements, output element and timestamp. 
*/ -std::tuple, string, string> +std::tuple, std::string, std::string> tokenizeRowIOT(const std::string& line, const std::vector& ignored_indices, int target_idx, // < 0 == ignored int timestamp_idx) // < 0 == ignored { - std::tuple, string, string> res; + std::tuple, std::string, std::string> res; table_tokenizer toker = get_row_tokenizer(line); int i = 0; for (const std::string& tok : toker) { if (!boost::binary_search(ignored_indices, i)) { - string el = boost::lexical_cast(tok); + std::string el = boost::lexical_cast(tok); if (target_idx == i) std::get<1>(res) = el; else if (timestamp_idx == i) @@ -783,7 +783,7 @@ tokenizeRowIOT(const std::string& line, return res; } -ITable loadITable(const string& file_name, +ITable loadITable(const std::string& file_name, const vector& ignore_features) { OC_ASSERT(!file_name.empty(), "the file name is empty"); @@ -801,7 +801,7 @@ ITable loadITable(const string& file_name, * * WARNING: it assumes the dataset has a header!!! */ -ITable loadITable_optimized(const string& file_name, +ITable loadITable_optimized(const std::string& file_name, const vector& ignore_features) { OC_ASSERT(!file_name.empty(), "the file name is empty"); @@ -831,8 +831,8 @@ ITable loadITable_optimized(const string& file_name, * This is only used for sparse table and could be optimized */ istream& istreamTable_OLD(istream& in, Table& tab, - const string& target_feature, - const vector& ignore_features) + const std::string& target_feature, + const std::vector& ignore_features) { istreamITable(in, tab.itable, ignore_features); @@ -845,7 +845,7 @@ istream& istreamTable_OLD(istream& in, Table& tab, type_node targ_type = tab.itable.get_type(target_feature); - string targ_feat = tab.itable.delete_column(target_feature); + std::string targ_feat = tab.itable.delete_column(target_feature); tab.otable.set_label(targ_feat); tab.otable.set_type(targ_type); @@ -860,8 +860,8 @@ istream& istreamTable_OLD(istream& in, Table& tab, * Warning: only works on dense data with 
header file. */ istream& istreamTable_ignore_indices(istream& in, Table& tab, - const string& target_feature, - const vector& ignore_indices) + const std::string& target_feature, + const std::vector& ignore_indices) { istreamITable_ignore_indices(in, tab.itable, ignore_indices); @@ -874,7 +874,7 @@ istream& istreamTable_ignore_indices(istream& in, Table& tab, type_node targ_type = tab.itable.get_type(target_feature); - string targ_feat = tab.itable.delete_column(target_feature); + std::string targ_feat = tab.itable.delete_column(target_feature); tab.otable.set_label(targ_feat); tab.otable.set_type(targ_type); @@ -885,8 +885,8 @@ istream& istreamTable_ignore_indices(istream& in, Table& tab, // ================================================================== static istream& -inferTableAttributes(istream& in, const string& target_feature, - const string& timestamp_feature, +inferTableAttributes(istream& in, const std::string& target_feature, + const std::string& timestamp_feature, const vector& ignore_features, type_tree& tt, bool& has_header, bool& is_sparse) { @@ -898,11 +898,11 @@ inferTableAttributes(istream& in, const string& target_feature, // Get a portion of the dataset into memory (cleaning weird stuff) std::vector lines; { - string line; + std::string line; is_sparse = false; while (get_data_line(in, line) && maxline-- > 0) { // It is sparse - is_sparse = is_sparse || string::npos != line.find(sparse_delim); + is_sparse = is_sparse || std::string::npos != line.find(sparse_delim); if (is_sparse) { // just get out // TODO could be simplified, optimized, etc in.seekg(beg); @@ -1013,9 +1013,9 @@ inferTableAttributes(istream& in, const string& target_feature, * 2) Load the actual data. 
*/ istream& istreamTable(istream& in, Table& tab, - const string& target_feature, - const string& timestamp_feature, - const vector& ignore_features) + const std::string& target_feature, + const std::string& timestamp_feature, + const std::vector& ignore_features) { // Infer the properties of the table without loading its content type_tree tt; @@ -1075,7 +1075,7 @@ istreamDenseTable_noHeader(istream& in, Table& tab, const type_tree& tt, bool has_header) { // Get the entire dataset into memory (cleaning weird stuff) - string line; + std::string line; std::vector lines; while (get_data_line(in, line)) lines.push_back(line); @@ -1107,14 +1107,14 @@ istreamDenseTable_noHeader(istream& in, Table& tab, tab.itable[i] = ftv(std::get<0>(tokenIOT)); // Fill output - string output_str = std::get<1>(tokenIOT); + std::string output_str = std::get<1>(tokenIOT); // If there is no valid target index, then there is no // "output" column! if ("" != output_str) tab.otable[i] = token_to_vertex(otype, output_str); // Fill date - string date_str = std::get<2>(tokenIOT); + std::string date_str = std::get<2>(tokenIOT); // If there is no valid timestamp index, then there is no // "output" column! 
if ("" != date_str) @@ -1145,8 +1145,8 @@ istreamDenseTable_noHeader(istream& in, Table& tab, } istream& istreamDenseTable(istream& in, Table& tab, - const string& target_feature, - const string& timestamp_feature, + const std::string& target_feature, + const std::string& timestamp_feature, const vector& ignore_features, const type_tree& tt, bool has_header) { @@ -1165,7 +1165,7 @@ istream& istreamDenseTable(istream& in, Table& tab, int timestamp_idx = -1; // disabled by default vector ignore_idxs; if (has_header) { - string line; + std::string line; get_data_line(in, line); vector header = tokenizeRow(line); diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 46c5f72c7b..bf428324e7 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -102,6 +102,15 @@ static std::vector tokenizeRow ( return res; } +// =========================================================== + +// TODO Should this be a TableValue? +class Table : public std::vector +{ + public: + Table(void); +}; + // =========================================================== typedef std::vector string_seq; @@ -109,10 +118,10 @@ typedef std::vector string_seq; string_seq get_header(const std::string& input_file); std::istream& istreamRawITable( - std::istream& in, ITable& tab, + std::istream& in, Table& tab, const std::vector& ignored_indices=std::vector()); -std::istream& istreamITable(std::istream& in, ITable& tab, +std::istream& istreamITable(std::istream& in, Table& tab, const string_seq& ignore_features); std::istream& istreamTable(std::istream& in, Table& tab, @@ -120,11 +129,11 @@ std::istream& istreamTable(std::istream& in, Table& tab, // TODO: reimplement loadITable with the same model of loadTable and // remove loadITable_optimized -ITable loadITable( +Table loadITable( const std::string& file_name, const string_seq& ignore_features=string_seq()); -ITable loadITable_optimized( +Table loadITable_optimized( const std::string& 
file_name, const string_seq& ignore_features=string_seq()); @@ -136,9 +145,9 @@ Table loadTable( const std::string& file_name, const string_seq& ignore_features=string_seq()); -std::istream& istreamDenseTable(std::istream& in, Table& tab, +std::istream& istreamDenseTable(std::istream&, Table&, const string_seq& ignore_features, - const type_tree& tt, bool has_header); + const std::vector&, bool has_header); } // ~namespaces opencog From 15a338e2545517e0ed946b812dcd4a1451c25ce9 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 13:45:26 +0300 Subject: [PATCH 10/56] More std namespace and atomese conversions --- opencog/persist/csv/table_read.cc | 148 +++++++++++++++--------------- 1 file changed, 75 insertions(+), 73 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index ea0d621e92..9f0f36c1e0 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -45,10 +46,11 @@ #include #include #include +#include #include "table_read.h" -namespace opencog { +using namespace opencog; using namespace boost; using namespace boost::phoenix; @@ -58,26 +60,26 @@ using boost::phoenix::arg_names::arg1; bool checkCarriageReturn(std::istream& in) { - char next_c = in.get(); - if (next_c == '\r') // DOS format - next_c = in.get(); - if (next_c == '\n') - return true; - return false; + char next_c = in.get(); + if (next_c == '\r') // DOS format + next_c = in.get(); + if (next_c == '\n') + return true; + return false; } void removeCarriageReturn(std::string& str) { - size_t s = str.size(); - if ((s > 0) && (str[s-1] == '\r')) - str.resize(s-1); + size_t s = str.size(); + if ((s > 0) && (str[s-1] == '\r')) + str.resize(s-1); } //* Remove non-ascii characters at the bigining of the line, only. 
void removeNonASCII(std::string& str) { - while (str.size() && (unsigned char)str[0] > 127) - str = str.substr(1); + while (str.size() && (unsigned char)str[0] > 127) + str = str.substr(1); } // ------------------------------------------------------- @@ -86,13 +88,13 @@ void removeNonASCII(std::string& str) // of hash, bang or semicolon. bool is_comment(const char c) { - if ('#' == c) return true; - if (';' == c) return true; - if ('!' == c) return true; - if ('\n' == c) return true; - if ('\r' == c) return true; - if (0 == c) return true; - return false; + if ('#' == c) return true; + if (';' == c) return true; + if ('!' == c) return true; + if ('\n' == c) return true; + if ('\r' == c) return true; + if (0 == c) return true; + return false; } /// Get one line of actual data. @@ -106,19 +108,19 @@ bool is_comment(const char c) // std::istream& get_data_line(std::istream& is, std::string& line) { - while (true) - { - getline(is, line); - if (!is) return is; - if (is_comment(line[0])) continue; + while (true) + { + getline(is, line); + if (!is) return is; + if (is_comment(line[0])) continue; - // Remove weird symbols at the start of the line (only). - removeNonASCII(line); - // Remove carriage return at end of line (for DOS files). - removeCarriageReturn(line); + // Remove weird symbols at the start of the line (only). + removeNonASCII(line); + // Remove carriage return at end of line (for DOS files). 
+ removeCarriageReturn(line); - return is; - } + return is; + } } // ------------------------------------------------------- @@ -141,15 +143,15 @@ static const char *sparse_delim = " : "; static std::pair parse_key_val(const std::string& chunk) { - std::pair res; - size_t pos = chunk.find(sparse_delim); - if (std::string::npos == pos) - return res; - std::string key = chunk.substr(0, pos); - boost::trim(key); - std::string val = chunk.substr(pos + strlen(sparse_delim)); - boost::trim(val); - return {key, val}; + std::pair res; + size_t pos = chunk.find(sparse_delim); + if (std::string::npos == pos) + return res; + std::string key = chunk.substr(0, pos); + boost::trim(key); + std::string val = chunk.substr(pos + strlen(sparse_delim)); + boost::trim(val); + return {key, val}; } /** @@ -158,23 +160,23 @@ parse_key_val(const std::string& chunk) */ table_tokenizer get_row_tokenizer(const std::string& line) { - typedef boost::escaped_list_separator separator; - typedef boost::tokenizer tokenizer; + typedef boost::escaped_list_separator separator; + typedef boost::tokenizer tokenizer; - // Tokenize line; currently, we allow tabs, commas, blanks. - static const separator sep("\\", ",\t ", "\""); - return tokenizer(line, sep); + // Tokenize line; currently, we allow tabs, commas, blanks. + static const separator sep("\\", ",\t ", "\""); + return tokenizer(line, sep); } // Same as above, but only allow commas as a column separator. table_tokenizer get_sparse_row_tokenizer(const std::string& line) { - typedef boost::escaped_list_separator separator; - typedef boost::tokenizer tokenizer; + typedef boost::escaped_list_separator separator; + typedef boost::tokenizer tokenizer; - // Tokenize line; currently, we allow tabs, commas, blanks. - static const separator sep("\\", ",", "\""); - return tokenizer(line, sep); + // Tokenize line; currently, we allow tabs, commas, blanks. 
+ static const separator sep("\\", ",", "\""); + return tokenizer(line, sep); } /** @@ -182,15 +184,15 @@ table_tokenizer get_sparse_row_tokenizer(const std::string& line) * Used by istreamTable. This will modify the line to remove leading * non-ASCII characters, as well as stripping of any carriage-returns. */ -vector tokenizeSparseRow(const std::string& line) +std::vector tokenizeSparseRow(const std::string& line) { - table_tokenizer tok = get_sparse_row_tokenizer(line); - vector res; - for (string t : tok) { - boost::trim(t); - res.push_back(t); - } - return res; + table_tokenizer tok = get_sparse_row_tokenizer(line); + std::vector res; + for (std::string t : tok) { + boost::trim(t); + res.push_back(t); + } + return res; } // ------------------------------------------------------- @@ -198,11 +200,11 @@ vector tokenizeSparseRow(const std::string& line) * Given an input string, guess the type of the string. * Inferable types are: boolean, contin and enum. */ -type_node infer_type_from_token(const std::string& token) +Type infer_type_from_token(const std::string& token) { /* Prefered representation is T's and 0's, to maximize clarity, - * readability. Numeric values are easily confused with contin - * type. + * readability. Numeric values are easily confused with floating + * point type. */ if (token == "0" || token == "1" || @@ -210,20 +212,20 @@ type_node infer_type_from_token(const std::string& token) token == "F" || token == "t" || token == "f") - return id::boolean_type; + return BOOL_VALUE; // If it starts with an alphabetic character, assume its a string else if (isalpha(token[0])) - return id::enum_type; + return STRING_VALUE; // Hope that we can cast this to a float point number. else { try { - lexical_cast(token); - return id::contin_type; + boost::lexical_cast(token); + return FLOAT_VALUE; } catch(...) 
{ - return id::ill_formed_type; + return VOID_VALUE; } } } @@ -234,13 +236,13 @@ type_node infer_type_from_token(const std::string& token) * Compare this to 'curr_guess', and upgrade the type inference * if it can be done consistently. */ -static type_node -infer_type_from_token2(type_node curr_guess, const std::string& token) +static Type +infer_type_from_token2(Type curr_guess, const std::string& token) { - type_node tokt = infer_type_from_token(token); + Type tokt = infer_type_from_token(token); // First time, just go with the flow. - if (id::unknown_type == curr_guess) + if (VOID_VALUE == curr_guess) return tokt; // Yayy! its consistent! @@ -248,17 +250,17 @@ infer_type_from_token2(type_node curr_guess, const std::string& token) return tokt; // If we saw 0,1 when expecting a contin, its a contin. - if ((id::contin_type == curr_guess) && (id::boolean_type == tokt)) + if ((FLOAT_VALUE == curr_guess) && (BOOL_VALUE == tokt)) return curr_guess; // If we thought its a boolean 0,1 it might be a contin. - if ((id::boolean_type == curr_guess) && (id::contin_type == tokt)) + if ((BOOL_VALUE == curr_guess) && (FLOAT_VALUE == tokt)) return tokt; // If we got to here, then there's some sort of unexpected // inconsistency in the column types; we've got to presume that // its just some crazy ascii string, i.e. enum_type. 
- return id::enum_type; + return STRING_VALUE; } /// cast string "token" to a vertex of type "tipe" @@ -1218,4 +1220,4 @@ Table loadTable(const std::string& file_name, return res; } -} // ~namespaces opencog +// ================================================================== From 4c5aac801ab997339e841a2b98914bd82cbdc56e Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 13:55:42 +0300 Subject: [PATCH 11/56] More namespace conversions --- opencog/persist/csv/table_read.cc | 157 ++++++++++++++++-------------- opencog/persist/csv/table_read.h | 19 ---- 2 files changed, 83 insertions(+), 93 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 9f0f36c1e0..08fd9435cd 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -58,7 +58,11 @@ using boost::phoenix::arg_names::arg1; // ------------------------------------------------------- -bool checkCarriageReturn(std::istream& in) +/** + * Return true if the next chars in 'in' correspond to carriage return + * (support UNIX and DOS format) and advance in of the checked chars. + */ +static bool checkCarriageReturn(std::istream& in) { char next_c = in.get(); if (next_c == '\r') // DOS format @@ -68,15 +72,20 @@ bool checkCarriageReturn(std::istream& in) return false; } -void removeCarriageReturn(std::string& str) +/** + * remove the carriage return (for DOS format) + */ +static void removeCarriageReturn(std::string& str) { size_t s = str.size(); if ((s > 0) && (str[s-1] == '\r')) str.resize(s-1); } -//* Remove non-ascii characters at the bigining of the line, only. -void removeNonASCII(std::string& str) +/** + * remove non ASCII char at the begining of the string + */ +static void removeNonASCII(std::string& str) { while (str.size() && (unsigned char)str[0] > 127) str = str.substr(1); @@ -86,7 +95,7 @@ void removeNonASCII(std::string& str) // Return true if the character is one of the standard comment // delimiters. 
Here, we define a 'standard delimiter' as one // of hash, bang or semicolon. -bool is_comment(const char c) +static bool is_comment(const char c) { if ('#' == c) return true; if (';' == c) return true; @@ -264,7 +273,7 @@ infer_type_from_token2(Type curr_guess, const std::string& token) } /// cast string "token" to a vertex of type "tipe" -ValuePtr token_to_boolean(const std::string& token) +static ValuePtr token_to_boolean(const std::string& token) { if ("0" == token || "F" == token || "f" == token) return createBoolValue(false); @@ -272,21 +281,21 @@ ValuePtr token_to_boolean(const std::string& token) if ("1" == token || "T" == token || "t" == token) return createBoolValue(true); - throw RuntimeError(TRACE_INFO, + throw SyntaxException(TRACE_INFO, "Expecting boolean value, got %s", token.c_str()); } -ValuePtr token_to_contin(const std::string& token) +static ValuePtr token_to_contin(const std::string& token) { try { return createFloatValue(lexical_cast(token)); } catch (boost::bad_lexical_cast&) { - throw RuntimeError(TRACE_INFO, + throw SyntaxException(TRACE_INFO, "Could not cast %s to floating point", token.c_str()); } } -ValuePtr token_to_vertex(Type tipe, const std::string& token) +ValuePtr opencog::token_to_vertex(Type tipe, const std::string& token) { if (BOOL_VALUE == tipe) return token_to_boolean(token); @@ -300,14 +309,14 @@ ValuePtr token_to_vertex(Type tipe, const std::string& token) if (isalpha(token[0])) return createStringValue(token); - throw RuntimeError(TRACE_INFO, + throw SyntaxException(TRACE_INFO, "Enum type must begin with alphabetic char, but %s doesn't", token.c_str()); } - stringstream ss; - ss << "Unable to convert token \"" << token << "\" to type=" << tipe << endl; - throw RuntimeError(TRACE_INFO, "%s", ss.str().c_str()); + throw SyntaxException(TRACE_INFO, + "Unable to convert token \"%s\" to type=%d", + token.c_str(), tipe); } // =========================================================== @@ -321,74 +330,74 @@ ValuePtr 
token_to_vertex(Type tipe, const std::string& token) * the appropriate type, and thunking for the header, and ignoring * certain features, must all be done as a separate step. */ -istream& istreamRawITable(istream& in, ITable& tab, - const vector& ignored_indices) +std::istream& istreamRawITable(std::istream& in, Table& tab, + const std::vector& ignored_indices) { - streampos beg = in.tellg(); - - // Get the entire dataset into memory - std::string line; - std::vector lines; - - // Read first few by hand. The first might be labels, so we must - // get at least the second line. But the second line might have - // all default feature values (i.e. no colon), so get the third... - dorepeat(20) { - if (!get_data_line(in, line)) - break; - // If it is a sparse file, we are outta here. - // Throw an std::exception, since we don't want to log this as an - // error (all the other exception types log to the log file). - if (string::npos != line.find (sparse_delim)) { - in.seekg(beg); - throw std::exception(); - } - lines.push_back(line); - } - - // Grab the rest of the file. - while (get_data_line(in, line)) - lines.push_back(line); + std::streampos beg = in.tellg(); + + // Get the entire dataset into memory + std::string line; + std::vector lines; + + // Read first few by hand. The first might be labels, so we must + // get at least the second line. But the second line might have + // all default feature values (i.e. no colon), so get the third... + dorepeat(20) { + if (!get_data_line(in, line)) + break; + // If it is a sparse file, we are outta here. + // Throw an std::exception, since we don't want to log this as an + // error (all the other exception types log to the log file). + if (string::npos != line.find (sparse_delim)) { + in.seekg(beg); + throw std::exception(); + } + lines.push_back(line); + } - // Determine the arity from the first line. - vector fl = tokenizeRow(lines[0], ignored_indices); - arity_t arity = fl.size(); + // Grab the rest of the file. 
+ while (get_data_line(in, line)) + lines.push_back(line); - std::atomic arity_fail_row(-1); - auto parse_line = [&](size_t i) - { - // tokenize the line and fill the table with - tab[i] = tokenizeRow(lines[i], ignored_indices); + // Determine the arity from the first line. + vector fl = tokenizeRow(lines[0], ignored_indices); + arity_t arity = fl.size(); - // Check arity - if (arity != (arity_t)tab[i].size()) - arity_fail_row = i + 1; - }; - - // Vector of indices [0, lines.size()) - size_t ls = lines.size(); - tab.resize(ls); - auto ir = boost::irange((size_t)0, ls); - vector indices(ir.begin(), ir.end()); - OMP_ALGO::for_each(indices.begin(), indices.end(), parse_line); - - if (-1 != arity_fail_row) { - in.seekg(beg); - OC_ASSERT(false, - "ERROR: Input file inconsistent: the %uth row has " - "a different number of columns than the rest of the file. " - "All rows should have the same number of columns.\n", - arity_fail_row.load()); - } - return in; + std::atomic arity_fail_row(-1); + auto parse_line = [&](size_t i) + { + // tokenize the line and fill the table with + tab[i] = tokenizeRow(lines[i], ignored_indices); + + // Check arity + if (arity != (arity_t)tab[i].size()) + arity_fail_row = i + 1; + }; + + // Vector of indices [0, lines.size()) + size_t ls = lines.size(); + tab.resize(ls); + auto ir = boost::irange((size_t)0, ls); + vector indices(ir.begin(), ir.end()); + OMP_ALGO::for_each(indices.begin(), indices.end(), parse_line); + + if (-1 != arity_fail_row) { + in.seekg(beg); + OC_ASSERT(false, + "ERROR: Input file inconsistent: the %uth row has " + "a different number of columns than the rest of the file. 
" + "All rows should have the same number of columns.\n", + arity_fail_row.load()); + } + return in; } std::vector get_header(const std::string& file_name) { - std::ifstream in(file_name.c_str()); - std::string line; - get_data_line(in, line); - return tokenizeRow(line); + std::ifstream in(file_name.c_str()); + std::string line; + get_data_line(in, line); + return tokenizeRow(line); } // =========================================================== diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index bf428324e7..4011876588 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -41,30 +41,11 @@ namespace opencog { -/** - * remove the carriage return (for DOS format) - */ -void removeCarriageReturn(std::string& str); - -/** - * remove non ASCII char at the begining of the string - */ -void removeNonASCII(std::string& str); - -/** - * Return true if the next chars in 'in' correspond to carriage return - * (support UNIX and DOS format) and advance in of the checked chars. 
- */ -bool checkCarriageReturn(std::istream& in); - /** * Convert strings to typed values */ -ValuePtr token_to_boolean(const std::string&); -ValuePtr token_to_contin(const std::string&); ValuePtr token_to_vertex(Type, const std::string&); - // =========================================================== typedef boost::tokenizer> table_tokenizer; From a8e17056453e25f20f5373b05293f62a0194f480 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 14:10:47 +0300 Subject: [PATCH 12/56] More conversions --- opencog/persist/csv/table_read.cc | 23 +++++++++++++---------- opencog/persist/csv/table_read.h | 2 +- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 08fd9435cd..5ea9d1236e 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -330,8 +330,8 @@ ValuePtr opencog::token_to_vertex(Type tipe, const std::string& token) * the appropriate type, and thunking for the header, and ignoring * certain features, must all be done as a separate step. */ -std::istream& istreamRawITable(std::istream& in, Table& tab, - const std::vector& ignored_indices) +std::istream& istreamRawITable(std::istream& in, std::vector& tab, + const std::vector& ignored_indices) { std::streampos beg = in.tellg(); @@ -342,13 +342,16 @@ std::istream& istreamRawITable(std::istream& in, Table& tab, // Read first few by hand. The first might be labels, so we must // get at least the second line. But the second line might have // all default feature values (i.e. no colon), so get the third... - dorepeat(20) { + dorepeat(20) + { if (!get_data_line(in, line)) break; + // If it is a sparse file, we are outta here. // Throw an std::exception, since we don't want to log this as an // error (all the other exception types log to the log file). 
- if (string::npos != line.find (sparse_delim)) { + if (std::string::npos != line.find (sparse_delim)) + { in.seekg(beg); throw std::exception(); } @@ -360,17 +363,17 @@ std::istream& istreamRawITable(std::istream& in, Table& tab, lines.push_back(line); // Determine the arity from the first line. - vector fl = tokenizeRow(lines[0], ignored_indices); - arity_t arity = fl.size(); + std::vector fl = tokenizeRow(lines[0], ignored_indices); + size_t arity = fl.size(); std::atomic arity_fail_row(-1); auto parse_line = [&](size_t i) { // tokenize the line and fill the table with - tab[i] = tokenizeRow(lines[i], ignored_indices); + tab[i] = tokenizeRow(lines[i], ignored_indices); // Check arity - if (arity != (arity_t)tab[i].size()) + if (arity != tab[i].size()) arity_fail_row = i + 1; }; @@ -378,12 +381,12 @@ std::istream& istreamRawITable(std::istream& in, Table& tab, size_t ls = lines.size(); tab.resize(ls); auto ir = boost::irange((size_t)0, ls); - vector indices(ir.begin(), ir.end()); + std::vector indices(ir.begin(), ir.end()); OMP_ALGO::for_each(indices.begin(), indices.end(), parse_line); if (-1 != arity_fail_row) { in.seekg(beg); - OC_ASSERT(false, + throw SyntaxException(TRACE_INFO, "ERROR: Input file inconsistent: the %uth row has " "a different number of columns than the rest of the file. 
" "All rows should have the same number of columns.\n", diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 4011876588..25b3aa5251 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -99,7 +99,7 @@ typedef std::vector string_seq; string_seq get_header(const std::string& input_file); std::istream& istreamRawITable( - std::istream& in, Table& tab, + std::istream& in, std::vector& table, const std::vector& ignored_indices=std::vector()); std::istream& istreamITable(std::istream& in, Table& tab, From cf8743bf7400653a8e3256330a8207224454211e Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 14:15:10 +0300 Subject: [PATCH 13/56] White-space conversion --- opencog/persist/csv/table_read.cc | 168 +++++++++++++++--------------- 1 file changed, 85 insertions(+), 83 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 5ea9d1236e..3c8ddf1bd2 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -409,37 +409,39 @@ std::vector get_header(const std::string& file_name) * into a multi_type_seq containing the typed values given the input * type signature. 
*/ -struct from_tokens_visitor : public boost::static_visitor +struct from_tokens_visitor : public boost::static_visitor { - from_tokens_visitor(const type_node_seq& types) : _types(types) { - all_boolean = boost::count(types, id::boolean_type) == (int)types.size(); - all_contin = boost::count(types, id::contin_type) == (int)types.size(); - } - result_type operator()(const string_seq& seq) { - result_type res; - if (all_boolean) { - res = builtin_seq(); - builtin_seq& bs = res.get_seq(); - boost::transform(seq, back_inserter(bs), token_to_boolean); - } - else if (all_contin) { - res = contin_seq(); - contin_seq& cs = res.get_seq(); - boost::transform(seq, back_inserter(cs), token_to_contin); - } - else { - res = vertex_seq(); - vertex_seq& vs = res.get_seq(); - boost::transform(_types, seq, back_inserter(vs), token_to_vertex); - } - return res; - } - template result_type operator()(const Seq& seq) { - OC_ASSERT(false, "You are not supposed to do that"); - return result_type(); - } - const type_node_seq& _types; - bool all_boolean, all_contin; + from_tokens_visitor(const std::vector& types) : _types(types) + { + all_boolean = boost::count(types, BOOL_VALUE) == (int)types.size(); + all_contin = boost::count(types, FLOAT_VALUE) == (int)types.size(); + } + result_type operator()(const string_seq& seq) + { + result_type res; + if (all_boolean) { + res = builtin_seq(); + builtin_seq& bs = res.get_seq(); + boost::transform(seq, back_inserter(bs), token_to_boolean); + } + else if (all_contin) { + res = contin_seq(); + contin_seq& cs = res.get_seq(); + boost::transform(seq, back_inserter(cs), token_to_contin); + } + else { + res = vertex_seq(); + vertex_seq& vs = res.get_seq(); + boost::transform(_types, seq, back_inserter(vs), token_to_vertex); + } + return res; + } + template result_type operator()(const Seq& seq) { + OC_ASSERT(false, "You are not supposed to do that"); + return result_type(); + } + const type_node_seq& _types; + bool all_boolean, all_contin; }; @@ 
-448,59 +450,59 @@ struct from_tokens_visitor : public boost::static_visitor */ struct from_sparse_tokens_visitor : public from_tokens_visitor { - from_sparse_tokens_visitor(const type_node_seq& types, - const std::map& index, - size_t fixed_arity) - : from_tokens_visitor(types), _index(index), _fixed_arity(fixed_arity) {} - result_type operator()(const string_seq& seq) { - using std::transform; - using std::for_each; - result_type res; - if (all_boolean) { - res = builtin_seq(_types.size(), id::logical_false); - builtin_seq& bs = res.get_seq(); - auto begin_sparse = seq.begin() + _fixed_arity; - transform(seq.begin(), begin_sparse, bs.begin(), token_to_boolean); - for (auto it = begin_sparse; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - bs[idx] = token_to_boolean(key_val.second); - } - } - } - else if (all_contin) { - res = contin_seq(_types.size(), 0.0); - contin_seq& cs = res.get_seq(); - auto begin_sparse = seq.cbegin() + _fixed_arity; - transform(seq.begin(), begin_sparse, cs.begin(), token_to_contin); - for (auto it = begin_sparse; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - cs[idx] = token_to_contin(key_val.second); - } - } - } - else { - res = vertex_seq(_types.size()); - vertex_seq& vs = res.get_seq(); - auto begin_sparse_types = _types.cbegin() + _fixed_arity; - auto begin_sparse_seq = seq.cbegin() + _fixed_arity; - transform(_types.begin(), begin_sparse_types, - seq.begin(), vs.begin(), token_to_vertex); - for (auto it = begin_sparse_seq; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - vs[idx] = token_to_vertex(_types[idx], key_val.second); - } - } - } - return res; - } - std::map _index; - size_t _fixed_arity; + from_sparse_tokens_visitor(const type_node_seq& types, + const std::map& 
index, + size_t fixed_arity) + : from_tokens_visitor(types), _index(index), _fixed_arity(fixed_arity) {} + result_type operator()(const string_seq& seq) { + using std::transform; + using std::for_each; + result_type res; + if (all_boolean) { + res = builtin_seq(_types.size(), id::logical_false); + builtin_seq& bs = res.get_seq(); + auto begin_sparse = seq.begin() + _fixed_arity; + transform(seq.begin(), begin_sparse, bs.begin(), token_to_boolean); + for (auto it = begin_sparse; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + bs[idx] = token_to_boolean(key_val.second); + } + } + } + else if (all_contin) { + res = contin_seq(_types.size(), 0.0); + contin_seq& cs = res.get_seq(); + auto begin_sparse = seq.cbegin() + _fixed_arity; + transform(seq.begin(), begin_sparse, cs.begin(), token_to_contin); + for (auto it = begin_sparse; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + cs[idx] = token_to_contin(key_val.second); + } + } + } + else { + res = vertex_seq(_types.size()); + vertex_seq& vs = res.get_seq(); + auto begin_sparse_types = _types.cbegin() + _fixed_arity; + auto begin_sparse_seq = seq.cbegin() + _fixed_arity; + transform(_types.begin(), begin_sparse_types, + seq.begin(), vs.begin(), token_to_vertex); + for (auto it = begin_sparse_seq; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + vs[idx] = token_to_vertex(_types[idx], key_val.second); + } + } + } + return res; + } + std::map _index; + size_t _fixed_arity; }; From 2556dbd74cb67ccc3971af94c7e70b8659ed3c7a Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 15:14:38 +0300 Subject: [PATCH 14/56] Ongoing conversion efforts --- opencog/atoms/value/README.md | 6 + opencog/persist/csv/table_read.cc | 246 +++++++++--------------------- 
opencog/persist/csv/table_read.h | 16 +- 3 files changed, 89 insertions(+), 179 deletions(-) diff --git a/opencog/atoms/value/README.md b/opencog/atoms/value/README.md index 95c2eb6ff0..60662eb7a8 100644 --- a/opencog/atoms/value/README.md +++ b/opencog/atoms/value/README.md @@ -94,4 +94,10 @@ Adding New Atom and Value Types Please see the [README-Adding-New-Atom-Types.md](../atom_types/README-Adding-New-Atom-Types.md) file. +See also the [Custom Types Example](../../../examples/type-system/README.md) +TODO +---- +* Perhaps add a TypeValue, which would be a vector of Types. If could + be useful as a kind-of table signature (for the csv table handling + code). diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 3c8ddf1bd2..22854331bc 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -330,7 +330,7 @@ ValuePtr opencog::token_to_vertex(Type tipe, const std::string& token) * the appropriate type, and thunking for the header, and ignoring * certain features, must all be done as a separate step. */ -std::istream& istreamRawITable(std::istream& in, std::vector& tab, +std::istream& istreamRawITable(std::istream& in, ITable& tab, const std::vector& ignored_indices) { std::streampos beg = in.tellg(); @@ -395,118 +395,9 @@ std::istream& istreamRawITable(std::istream& in, std::vector& tab, return in; } -std::vector get_header(const std::string& file_name) -{ - std::ifstream in(file_name.c_str()); - std::string line; - get_data_line(in, line); - return tokenizeRow(line); -} - // =========================================================== -/** - * Visitor to parse a list of strings (buried in a multi_type_seq) - * into a multi_type_seq containing the typed values given the input - * type signature. 
- */ -struct from_tokens_visitor : public boost::static_visitor -{ - from_tokens_visitor(const std::vector& types) : _types(types) - { - all_boolean = boost::count(types, BOOL_VALUE) == (int)types.size(); - all_contin = boost::count(types, FLOAT_VALUE) == (int)types.size(); - } - result_type operator()(const string_seq& seq) - { - result_type res; - if (all_boolean) { - res = builtin_seq(); - builtin_seq& bs = res.get_seq(); - boost::transform(seq, back_inserter(bs), token_to_boolean); - } - else if (all_contin) { - res = contin_seq(); - contin_seq& cs = res.get_seq(); - boost::transform(seq, back_inserter(cs), token_to_contin); - } - else { - res = vertex_seq(); - vertex_seq& vs = res.get_seq(); - boost::transform(_types, seq, back_inserter(vs), token_to_vertex); - } - return res; - } - template result_type operator()(const Seq& seq) { - OC_ASSERT(false, "You are not supposed to do that"); - return result_type(); - } - const type_node_seq& _types; - bool all_boolean, all_contin; -}; - -/** - * The class below tokenizes one row, and jams it into the table - */ -struct from_sparse_tokens_visitor : public from_tokens_visitor -{ - from_sparse_tokens_visitor(const type_node_seq& types, - const std::map& index, - size_t fixed_arity) - : from_tokens_visitor(types), _index(index), _fixed_arity(fixed_arity) {} - result_type operator()(const string_seq& seq) { - using std::transform; - using std::for_each; - result_type res; - if (all_boolean) { - res = builtin_seq(_types.size(), id::logical_false); - builtin_seq& bs = res.get_seq(); - auto begin_sparse = seq.begin() + _fixed_arity; - transform(seq.begin(), begin_sparse, bs.begin(), token_to_boolean); - for (auto it = begin_sparse; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - bs[idx] = token_to_boolean(key_val.second); - } - } - } - else if (all_contin) { - res = contin_seq(_types.size(), 0.0); - contin_seq& cs = res.get_seq(); - 
auto begin_sparse = seq.cbegin() + _fixed_arity; - transform(seq.begin(), begin_sparse, cs.begin(), token_to_contin); - for (auto it = begin_sparse; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - cs[idx] = token_to_contin(key_val.second); - } - } - } - else { - res = vertex_seq(_types.size()); - vertex_seq& vs = res.get_seq(); - auto begin_sparse_types = _types.cbegin() + _fixed_arity; - auto begin_sparse_seq = seq.cbegin() + _fixed_arity; - transform(_types.begin(), begin_sparse_types, - seq.begin(), vs.begin(), token_to_vertex); - for (auto it = begin_sparse_seq; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - vs[idx] = token_to_vertex(_types[idx], key_val.second); - } - } - } - return res; - } - std::map _index; - size_t _fixed_arity; -}; - - -// =========================================================== +#if NOT_RIGHT_NOW /** * Fill the input table, given a file in 'sparse' format. * @@ -630,29 +521,32 @@ istream& istreamSparseITable(istream& in, ITable& tab) return in; } +#endif /** * Infer the column types of the input table. It is assumed the * table's rows are vector of strings. */ -type_node_seq infer_column_types(const ITable& tab) +std::vector infer_column_types(const std::vector& tab) { - vector::const_iterator rowit = tab.begin(); + std::vector::const_iterator rowit = tab.begin(); - arity_t arity = rowit->size(); - type_node_seq types(arity, id::unknown_type); + size_t arity = rowit->size(); + std::vector types(arity, VOID_VALUE); - // Skip the first line, it might be a header... - // and that would confuse type inference. - if (tab.size() > 1) - ++rowit; - for (; rowit != tab.end(); ++rowit) - { - const string_seq& tokens = rowit->get_seq(); - for (arity_t i=0; i 1) + ++rowit; + + // Loop over all rows; this performs a consistency check. 
+ for (; rowit != tab.end(); ++rowit) + { + const string_seq& tokens = *rowit; + for (size_t i=0; i& col_types) { - const string_seq& row = tab.begin()->get_seq(); + const string_seq& row = *tab.begin(); - arity_t arity = row.size(); + size_t arity = row.size(); - for (arity_t i=0; i& tokens, const type_node_seq& col_types) +bool is_header(const string_seq& tokens, const std::vector& col_types) { - for (size_t i = 0; i < tokens.size(); i++) { - type_node flt = infer_type_from_token2(col_types[i], tokens[i]); - if ((id::enum_type == flt) && (id::enum_type != col_types[i])) - return true; - } - return false; + for (size_t i = 0; i < tokens.size(); i++) + { + Type flt = infer_type_from_token2(col_types[i], tokens[i]); + if ((STRING_VALUE == flt) && (STRING_VALUE != col_types[i])) + return true; + } + return false; +} + +std::vector get_header(const std::string& file_name) +{ + std::ifstream in(file_name.c_str()); + std::string line; + get_data_line(in, line); + return tokenizeRow(line); } /** @@ -699,41 +603,41 @@ bool is_header(const vector& tokens, const type_node_seq& col_types) * infer the column types, and the presence of a header. */ istream& istreamITable(istream& in, ITable& tab, - const vector& ignore_features) + const std::vector& ignore_features) { - try { - istreamRawITable(in, tab); - } - catch (std::exception& e) { - istreamSparseITable(in, tab); - // Get rid of the unwanted columns. - tab.delete_columns(ignore_features); - return in; - } + istreamRawITable(in, tab); + try { + } + catch (std::exception& e) { + istreamSparseITable(in, tab); + // Get rid of the unwanted columns. + tab.delete_columns(ignore_features); + return in; + } - // Determine the column types. - type_node_seq col_types = infer_column_types(tab); - tab.set_types(col_types); + // Determine the column types. + type_node_seq col_types = infer_column_types(tab); + tab.set_types(col_types); - // If there is a header row, then it must be the column labels. 
- if (has_header(tab, col_types)) { - tab.set_labels(tab.begin()->get_seq()); - tab.erase(tab.begin()); - } + // If there is a header row, then it must be the column labels. + if (has_header(tab, col_types)) { + tab.set_labels(tab.begin()->get_seq()); + tab.erase(tab.begin()); + } - // Now that we have some column labels to work off of, - // Get rid of the unwanted columns. - tab.delete_columns(ignore_features); + // Now that we have some column labels to work off of, + // Get rid of the unwanted columns. + tab.delete_columns(ignore_features); - // Finally, perform a column type conversion - from_tokens_visitor ftv(tab.get_types()); - auto aft = apply_visitor(ftv); - OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), - [&](multi_type_seq& seq) { - return aft(seq.get_variant()); - }); + // Finally, perform a column type conversion + from_tokens_visitor ftv(tab.get_types()); + auto aft = apply_visitor(ftv); + OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), + [&](multi_type_seq& seq) { + return aft(seq.get_variant()); + }); - return in; + return in; } /** diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 25b3aa5251..4aa885b14d 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -85,24 +85,24 @@ static std::vector tokenizeRow ( // =========================================================== +// TODO: Should this be a StringValue? +typedef std::vector string_seq; + +typedef std::vector ITable; + // TODO Should this be a TableValue? 
-class Table : public std::vector -{ - public: - Table(void); -}; +typedef std::vector Table; // =========================================================== -typedef std::vector string_seq; // Get the header of a DSV file (assuming there is one) string_seq get_header(const std::string& input_file); std::istream& istreamRawITable( - std::istream& in, std::vector& table, + std::istream& in, ITable& table, const std::vector& ignored_indices=std::vector()); -std::istream& istreamITable(std::istream& in, Table& tab, +std::istream& istreamITable(std::istream& in, ITable& tab, const string_seq& ignore_features); std::istream& istreamTable(std::istream& in, Table& tab, From 4709381d54747d703ae818c65e0726306fe974e3 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 15:34:10 +0300 Subject: [PATCH 15/56] More conversions --- opencog/persist/csv/table_read.cc | 148 ++---------------------------- 1 file changed, 7 insertions(+), 141 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 22854331bc..9d6ee86369 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -397,132 +397,6 @@ std::istream& istreamRawITable(std::istream& in, ITable& tab, // =========================================================== -#if NOT_RIGHT_NOW -/** - * Fill the input table, given a file in 'sparse' format. - * - * The sparse table format consists of some fixed number of columns, - * in comma-separated format, followed by key-value pairs, also - * tab-separated. viz: - * - * val, val, val, name:val, name:val, name:val - * - * Thus, for example, a row such as - * - * earn, issued : 1, results : 2, ending : 1, including : 1 - * - * indicates that there one fixed column, of enum type, (the enum value - * being "earn"), and that features called "issued", "ending" and - * "including" have a contin value of 1.0 and "results" has a contin - * value of 2. 
- * - * The routine does NOT store the table in sparse format: it stores the - * full, exploded table. This could be bad ... - * TODO: we really need a sparse table format, as well. - * - * The "Raw" format has all data as strings; type conversion to the - * appropriate type, must all be done as a separate step. - */ -istream& istreamSparseITable(istream& in, ITable& tab) -{ - // The raw dataset - std::vector lines; - - // The first non-comment line is assumed to be the header. - // ... unless it isn't. (The header must not contain a colon). - vector labs; - size_t fixed_arity = 0; - std::string header; - get_data_line(in, header); - if (string::npos == header.find(sparse_delim)) { - // Determine the arity of the fixed columns - vector hdr = tokenizeSparseRow(header); - fixed_arity = hdr.size(); - labs = hdr; - } - else { - lines.push_back(header); - } - - // Get the entire dataset into memory - std::string iline; - while (get_data_line(in, iline)) - lines.push_back(iline); - - if (0 == fixed_arity) { - vector fixy = tokenizeSparseRow(lines[0]); - // count commas, until a semi-colon is found. - while (string::npos == fixy[fixed_arity].find(sparse_delim)) - fixed_arity++; - } - logger().info() << "Sparse file fixed column count=" << fixed_arity; - - // Get a list of all of the features. - set feats; - // All sparse features have the same type. - type_node feat_type = id::unknown_type; - - // Fixed features may have different types, by column. - type_node_seq types(fixed_arity, id::unknown_type); - - for (const std::string& line : lines) { - vector chunks = tokenizeSparseRow(line); - vector::const_iterator pit = chunks.begin(); - - // Infer the types of the fixed features. - size_t off = 0; - for (; off < fixed_arity; ++off, ++pit) - types[off] = infer_type_from_token2(types[off], *pit); - - for (; pit != chunks.end(); ++pit) { - // Rip out the key-value pairs - auto key_val = parse_key_val(*pit); - if (key_val == pair()) - break; - // Store the key, uniquely. 
Store best guess as the type. - feats.insert(key_val.first); - feat_type = infer_type_from_token2(feat_type, key_val.second); - } - } - logger().info() << "Sparse file unique features count=" << feats.size(); - logger().info() << "Sparse file feature type=" << feat_type; - logger().info() << "Sparse file row count=" << lines.size(); - - // Convert the feature set into a list of labels. - // 'index' is a map from feature name to column number. - size_t cnt = fixed_arity; - std::map index; - for (const std::string& key : feats) { - types.push_back(feat_type); - labs.push_back(key); - index[key] = cnt; - cnt++; - } - tab.set_labels(labs); - tab.set_types(types); - - // And finally, stuff up the table. - from_sparse_tokens_visitor fstv(types, index, fixed_arity); - auto fill_line = [&](int i) - { - const std::string& line = lines[i]; - // Tokenize the line - vector chunks = tokenizeSparseRow(line); - multi_type_seq row = fstv(chunks); - tab[i] = row; - }; - - // Vector of indices [0, lines.size()) - size_t ls = lines.size(); - tab.resize(ls); - auto ir = boost::irange((size_t)0, ls); - vector indices(ir.begin(), ir.end()); - OMP_ALGO::for_each(indices.begin(), indices.end(), fill_line); - - return in; -} -#endif - /** * Infer the column types of the input table. It is assumed the * table's rows are vector of strings. @@ -602,32 +476,24 @@ std::vector get_header(const std::string& file_name) * the entire table, as a collection of strings. Next, it tries to * infer the column types, and the presence of a header. */ -istream& istreamITable(istream& in, ITable& tab, - const std::vector& ignore_features) +std::istream& istreamITable(std::istream& in, ITable& tab, + const std::vector& ignore_features) { istreamRawITable(in, tab); - try { - } - catch (std::exception& e) { - istreamSparseITable(in, tab); - // Get rid of the unwanted columns. - tab.delete_columns(ignore_features); - return in; - } // Determine the column types. 
- type_node_seq col_types = infer_column_types(tab); - tab.set_types(col_types); + std::vector col_types = infer_column_types(tab); // If there is a header row, then it must be the column labels. - if (has_header(tab, col_types)) { - tab.set_labels(tab.begin()->get_seq()); + if (has_header(tab, col_types)) + { + // tab.set_labels(*tab.begin()); tab.erase(tab.begin()); } // Now that we have some column labels to work off of, // Get rid of the unwanted columns. - tab.delete_columns(ignore_features); + // tab.delete_columns(ignore_features); // Finally, perform a column type conversion from_tokens_visitor ftv(tab.get_types()); From 40648536da270884d3c998453aef17455911c623 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 15:41:55 +0300 Subject: [PATCH 16/56] Remove cruft --- opencog/persist/csv/table_read.cc | 173 +++--------------------------- 1 file changed, 16 insertions(+), 157 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 9d6ee86369..1c138d36bf 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -495,6 +495,12 @@ std::istream& istreamITable(std::istream& in, ITable& tab, // Get rid of the unwanted columns. // tab.delete_columns(ignore_features); + // determined ignore_indices + std::vector ignore_indices = get_indices(ignore_features, + get_header(file_name)); + + +.... // Finally, perform a column type conversion from_tokens_visitor ftv(tab.get_types()); auto aft = apply_visitor(ftv); @@ -506,166 +512,19 @@ std::istream& istreamITable(std::istream& in, ITable& tab, return in; } -/** - * Like istreamITable but add the option to ignore indices. - * - * It's akind of a temporary hack, till it's clear that this is much - * faster and we should recode istreamITable to ignore features - * head-on. - * - * Also, it assumes that the dataset is not sparse. 
- */ -istream& istreamITable_ignore_indices(istream& in, ITable& tab, - const vector& ignore_indices) -{ - istreamRawITable(in, tab, ignore_indices); - - // Determine the column types. - type_node_seq col_types = infer_column_types(tab); - tab.set_types(col_types); - - // If there is a header row, then it must be the column labels. - if (has_header(tab, col_types)) { - tab.set_labels(tab.begin()->get_seq()); - tab.erase(tab.begin()); - } - - // Finally, perform a column type conversion - from_tokens_visitor ftv(tab.get_types()); - auto aft = apply_visitor(ftv); - OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), - [&](multi_type_seq& seq) { - return aft(seq.get_variant()); - }); - - return in; -} - -/** - * Take a line and return a triple with vector containing the input - * elements, output element and timestamp. - */ -std::tuple, std::string, std::string> -tokenizeRowIOT(const std::string& line, - const std::vector& ignored_indices, - int target_idx, // < 0 == ignored - int timestamp_idx) // < 0 == ignored -{ - std::tuple, std::string, std::string> res; - table_tokenizer toker = get_row_tokenizer(line); - int i = 0; - for (const std::string& tok : toker) { - if (!boost::binary_search(ignored_indices, i)) { - std::string el = boost::lexical_cast(tok); - if (target_idx == i) - std::get<1>(res) = el; - else if (timestamp_idx == i) - std::get<2>(res) = el; - else - std::get<0>(res).push_back(el); - } - i++; - } - return res; -} - ITable loadITable(const std::string& file_name, - const vector& ignore_features) + const std::vector& ignore_features) { - OC_ASSERT(!file_name.empty(), "the file name is empty"); - ifstream in(file_name.c_str()); - OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); - - ITable res; - istreamITable(in, res, ignore_features); - return res; -} - -/** - * Like loadITable but it is optimized by ignoring features head-on - * (rather than loading them, then removing them. - * - * WARNING: it assumes the dataset has a header!!! 
- */ -ITable loadITable_optimized(const std::string& file_name, - const vector& ignore_features) -{ - OC_ASSERT(!file_name.empty(), "the file name is empty"); - ifstream in(file_name.c_str()); - OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); - - // determined ignore_indices - vector ignore_indices = get_indices(ignore_features, - get_header(file_name)); - - ITable res; - istreamITable_ignore_indices(in, res, ignore_indices); - return res; -} - -/** - * Fill an input table and output table given a DSV - * (delimiter-seperated values) file format, where delimiters are ',', - * ' ' or '\t'. - * - * It is assumed that each row have the same number of columns, if not - * an assert is raised. - * - * pos specifies the position of the output, if -1 it is the last - * position. The default position is 0, the first column. - * - * This is only used for sparse table and could be optimized - */ -istream& istreamTable_OLD(istream& in, Table& tab, - const std::string& target_feature, - const std::vector& ignore_features) -{ - istreamITable(in, tab.itable, ignore_features); - - tab.otable = tab.itable.get_column_data(target_feature); - OC_ASSERT(0 != tab.otable.size(), - "Fatal Error: target feature \"%s\" not found", - target_feature.c_str()); - - tab.target_pos = tab.itable.get_column_offset(target_feature); - - type_node targ_type = tab.itable.get_type(target_feature); - - std::string targ_feat = tab.itable.delete_column(target_feature); - - tab.otable.set_label(targ_feat); - tab.otable.set_type(targ_type); - - return in; -} - -/** - * Like istreamTable but optimize by ignoring features head-on rather - * than loading them then removing them. - * - * Warning: only works on dense data with header file. 
- */ -istream& istreamTable_ignore_indices(istream& in, Table& tab, - const std::string& target_feature, - const std::vector& ignore_indices) -{ - istreamITable_ignore_indices(in, tab.itable, ignore_indices); - - tab.otable = tab.itable.get_column_data(target_feature); - OC_ASSERT(0 != tab.otable.size(), - "Fatal Error: target feature \"%s\" not found", - target_feature.c_str()); - - tab.target_pos = tab.itable.get_column_offset(target_feature); - - type_node targ_type = tab.itable.get_type(target_feature); - - std::string targ_feat = tab.itable.delete_column(target_feature); - - tab.otable.set_label(targ_feat); - tab.otable.set_type(targ_type); + if (file_name.empty()) + throw RuntimeException(TRACE_INFO, "The file name is empty!"); + std::ifstream in(file_name.c_str()); + if (not in.is_open()) + throw RuntimeException(TRACE_INFO, + "Could not open %s", file_name.c_str()); - return in; + ITable res; + istreamITable(in, res, ignore_features); + return res; } // ================================================================== From 7f22f38ee8a501329406cf6d383711a5d092b9f6 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 19:47:13 +0300 Subject: [PATCH 17/56] Whitespace rework --- opencog/persist/csv/table_read.cc | 264 ++++++++++++++---------------- opencog/persist/csv/table_read.h | 13 +- 2 files changed, 130 insertions(+), 147 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 1c138d36bf..1b776ef19d 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -468,6 +469,8 @@ std::vector get_header(const std::string& file_name) return tokenizeRow(line); } +#if 0 + /** * Fill the input table only, given a DSV (delimiter-seperated values) * file format, where delimiters are ',', ' ' or '\t'. 
@@ -512,139 +515,125 @@ std::istream& istreamITable(std::istream& in, ITable& tab, return in; } -ITable loadITable(const std::string& file_name, - const std::vector& ignore_features) -{ - if (file_name.empty()) - throw RuntimeException(TRACE_INFO, "The file name is empty!"); - std::ifstream in(file_name.c_str()); - if (not in.is_open()) - throw RuntimeException(TRACE_INFO, - "Could not open %s", file_name.c_str()); - - ITable res; - istreamITable(in, res, ignore_features); - return res; -} +#endif // ================================================================== -static istream& -inferTableAttributes(istream& in, const std::string& target_feature, - const std::string& timestamp_feature, +static std::istream& +inferTableAttributes(std::istream& in, const vector& ignore_features, - type_tree& tt, bool& has_header, bool& is_sparse) + std::vector& tt, bool& has_header) { - // maxline is the maximum number of lines to read to infer the - // attributes. A negative number means reading all lines. - int maxline = 20; - streampos beg = in.tellg(); - // Get a portion of the dataset into memory (cleaning weird stuff) - std::vector lines; - { - std::string line; - is_sparse = false; - while (get_data_line(in, line) && maxline-- > 0) { - // It is sparse - is_sparse = is_sparse || std::string::npos != line.find(sparse_delim); - if (is_sparse) { // just get out - // TODO could be simplified, optimized, etc - in.seekg(beg); - in.clear(); // in case it has reached the eof - return in; - } - - // put the line in a buffer - lines.push_back(line); - } - } + // maxline is the maximum number of lines to read to infer the + // attributes. A negative number means reading all lines. 
+ int maxline = 20; + streampos beg = in.tellg(); - // parse what could be a header - const vector maybe_header = tokenizeRow(lines.front()); - - // determine arity - arity_t arity = maybe_header.size(); - std::atomic arity_fail_row(-1); - - // determine initial type - type_node_seq types(arity, id::unknown_type); - - // parse the rest, determine its type and whether the arity is - // consistent - for (size_t i = 1; i < lines.size(); ++i) { - // Parse line - const string_seq& tokens = tokenizeRow(lines[i]); - - // Check arity - if (arity != (arity_t)tokens.size()) { - arity_fail_row = i + 1; - in.seekg(beg); - in.clear(); // in case it has reached the eof - OC_ASSERT(false, - "ERROR: Input file inconsistent: the %uth row has a " - "different number of columns than the rest of the file. " - "All rows should have the same number of columns.\n", - arity_fail_row.load()); - } + // Get a portion of the dataset into memory (cleaning weird stuff) + std::vector lines; + { + std::string line; + while (get_data_line(in, line) && maxline-- > 0) { + // It is sparse + is_sparse = is_sparse || std::string::npos != line.find(sparse_delim); + if (is_sparse) { // just get out + // TODO could be simplified, optimized, etc + in.seekg(beg); + in.clear(); // in case it has reached the eof + return in; + } + + // put the line in a buffer + lines.push_back(line); + } + } - // Infer type - boost::transform(types, tokens, types.begin(), - infer_type_from_token2); - } + // parse what could be a header + const vector maybe_header = tokenizeRow(lines.front()); - // Determine has_header - has_header = is_header(maybe_header, types); + // determine arity + arity_t arity = maybe_header.size(); + std::atomic arity_fail_row(-1); - // Determine type signature - if (has_header) { + // determine initial type + type_node_seq types(arity, id::unknown_type); - // if unspecified, the target is the first column - unsigned target_idx = 0; + // parse the rest, determine its type and whether the arity is + 
// consistent + for (size_t i = 1; i < lines.size(); ++i) { + // Parse line + const string_seq& tokens = tokenizeRow(lines[i]); - // target feature will be ignored - if (!target_feature.empty()) { - auto target_it = std::find(maybe_header.begin(), maybe_header.end(), - target_feature); - OC_ASSERT(target_it != maybe_header.end(), "Target %s not found", - target_feature.c_str()); - target_idx = std::distance(maybe_header.begin(), target_it); - } - vector ignore_idxs = - get_indices(ignore_features, maybe_header); - ignore_idxs.push_back(target_idx); - boost::sort(ignore_idxs); + // Check arity + if (arity != (arity_t)tokens.size()) { + arity_fail_row = i + 1; + in.seekg(beg); + in.clear(); // in case it has reached the eof + OC_ASSERT(false, + "ERROR: Input file inconsistent: the %uth row has a " + "different number of columns than the rest of the file. " + "All rows should have the same number of columns.\n", + arity_fail_row.load()); + } - // Include timestamp feature as idx to ignore - if (!timestamp_feature.empty()) { - auto timestamp_it = std::find(maybe_header.begin(), maybe_header.end(), - timestamp_feature); - OC_ASSERT(timestamp_it != maybe_header.end(), - "Timestamp feature %s not found", - timestamp_feature.c_str()); - unsigned timestamp_idx = std::distance(maybe_header.begin(), timestamp_it); - ignore_idxs.push_back(timestamp_idx); - boost::sort(ignore_idxs); - } + // Infer type + boost::transform(types, tokens, types.begin(), + infer_type_from_token2); + } - // Generate type signature - type_node otype = types[target_idx]; - type_node_seq itypes; - for (unsigned i = 0; i < types.size(); ++i) - if (!boost::binary_search(ignore_idxs, i)) - itypes.push_back(types[i]); - tt = gen_signature(itypes, otype); - } else { - // No header, the target is the first column - type_node otype = types[0]; - types.erase(types.begin()); - tt = gen_signature(types, otype); - } - logger().debug() << "Infered type tree: " << tt; + // Determine has_header + has_header = 
is_header(maybe_header, types); - in.seekg(beg); - in.clear(); // in case it has reached the eof - return in; + // Determine type signature + if (has_header) { + + // if unspecified, the target is the first column + unsigned target_idx = 0; + + // target feature will be ignored + if (!target_feature.empty()) { + auto target_it = std::find(maybe_header.begin(), maybe_header.end(), + target_feature); + OC_ASSERT(target_it != maybe_header.end(), "Target %s not found", + target_feature.c_str()); + target_idx = std::distance(maybe_header.begin(), target_it); + } + vector ignore_idxs = + get_indices(ignore_features, maybe_header); + ignore_idxs.push_back(target_idx); + boost::sort(ignore_idxs); + + // Include timestamp feature as idx to ignore + if (!timestamp_feature.empty()) { + auto timestamp_it = std::find(maybe_header.begin(), maybe_header.end(), + timestamp_feature); + OC_ASSERT(timestamp_it != maybe_header.end(), + "Timestamp feature %s not found", + timestamp_feature.c_str()); + unsigned timestamp_idx = std::distance(maybe_header.begin(), timestamp_it); + ignore_idxs.push_back(timestamp_idx); + boost::sort(ignore_idxs); + } + + // Generate type signature + type_node otype = types[target_idx]; + type_node_seq itypes; + for (unsigned i = 0; i < types.size(); ++i) + if (!boost::binary_search(ignore_idxs, i)) + itypes.push_back(types[i]); + tt = gen_signature(itypes, otype); + } else { + // No header, the target is the first column + type_node otype = types[0]; + types.erase(types.begin()); + tt = gen_signature(types, otype); + } + logger().debug() << "Infered type tree: " << tt; + + in.seekg(beg); + in.clear(); // in case it has reached the eof + return in; } /** @@ -657,17 +646,15 @@ inferTableAttributes(istream& in, const std::string& target_feature, * * 2) Load the actual data. 
*/ -istream& istreamTable(istream& in, Table& tab, - const std::string& target_feature, - const std::string& timestamp_feature, - const std::vector& ignore_features) +std::istream& +istreamTable(const Handle& anchor, + std::istream& in, + const std::vector& ignore_features) { // Infer the properties of the table without loading its content - type_tree tt; - bool has_header, is_sparse; - streampos beg = in.tellg(); - inferTableAttributes(in, target_feature, timestamp_feature, - ignore_features, tt, has_header, is_sparse); + bool has_header; + std::streampos beg = in.tellg(); + inferTableAttributes(in, ignore_features, tt, has_header); in.seekg(beg); if (is_sparse) { @@ -683,6 +670,7 @@ istream& istreamTable(istream& in, Table& tab, // ================================================================== +#if 0 /** * Take a line and return a pair with vector containing the input * elements and then output element. @@ -847,20 +835,22 @@ istream& istreamDenseTable(istream& in, Table& tab, ignore_idxs, tt, has_header); } +#endif + // ================================================================== -Table loadTable(const std::string& file_name, - const std::string& target_feature, - const std::string& timestamp_feature, - const string_seq& ignore_features) +void loadTable(const Handle& anchor, + const std::string& file_name, + const string_seq& ignore_features) { - OC_ASSERT(!file_name.empty(), "the file name is empty"); - ifstream in(file_name.c_str()); - OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); + if (file_name.empty()) + throw RuntimeException(TRACE_INFO, "The file name is empty!"); + std::ifstream in(file_name.c_str()); + if (not in.is_open()) + throw RuntimeException(TRACE_INFO, + "Could not open %s", file_name.c_str()); - Table res; - istreamTable(in, res, target_feature, timestamp_feature, ignore_features); - return res; + istreamTable(acnhro, in, ignore_features); } // ================================================================== diff 
--git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 4aa885b14d..7b8164a84b 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -119,17 +119,10 @@ Table loadITable_optimized( const string_seq& ignore_features=string_seq()); /** - * If target_feature is empty then, in case there is no header, it is - * assumed to be the first feature. */ -Table loadTable( - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - -std::istream& istreamDenseTable(std::istream&, Table&, - const string_seq& ignore_features, - const std::vector&, bool has_header); - +void loadTable(const Handle& anchor, + const std::string& file_name, + const string_seq& ignore_features=string_seq()); } // ~namespaces opencog From 277fec8c4e93a87315f30008fbe0c1bbb3dbc704 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 19:59:25 +0300 Subject: [PATCH 18/56] Convert and simplify table reading --- opencog/persist/csv/table_read.cc | 105 +++++++++--------------------- 1 file changed, 30 insertions(+), 75 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 1b776ef19d..53d9ac7c43 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -521,113 +521,68 @@ std::istream& istreamITable(std::istream& in, ITable& tab, static std::istream& inferTableAttributes(std::istream& in, - const vector& ignore_features, + const std::vector& ignore_features, std::vector& tt, bool& has_header) { + has_header = false; + + std::streampos beg = in.tellg(); // maxline is the maximum number of lines to read to infer the // attributes. A negative number means reading all lines. 
int maxline = 20; - streampos beg = in.tellg(); // Get a portion of the dataset into memory (cleaning weird stuff) - std::vector lines; - { - std::string line; - while (get_data_line(in, line) && maxline-- > 0) { - // It is sparse - is_sparse = is_sparse || std::string::npos != line.find(sparse_delim); - if (is_sparse) { // just get out - // TODO could be simplified, optimized, etc - in.seekg(beg); - in.clear(); // in case it has reached the eof - return in; - } - - // put the line in a buffer - lines.push_back(line); - } - } + std::vector lines; + std::string line; + while (get_data_line(in, line) && maxline-- > 0) + lines.push_back(line); - // parse what could be a header - const vector maybe_header = tokenizeRow(lines.front()); + // Parse what could be a header + const std::vector maybe_header = + tokenizeRow(lines.front()); - // determine arity - arity_t arity = maybe_header.size(); + // Determine arity + size_t arity = maybe_header.size(); std::atomic arity_fail_row(-1); - // determine initial type - type_node_seq types(arity, id::unknown_type); + // Determine initial type + std::vector types(arity, VOID_VALUE); - // parse the rest, determine its type and whether the arity is + // Parse the rest, determine its type and whether the arity is // consistent - for (size_t i = 1; i < lines.size(); ++i) { + for (size_t i = 1; i < lines.size(); ++i) + { // Parse line - const string_seq& tokens = tokenizeRow(lines[i]); + const string_seq& tokens = tokenizeRow(lines[i]); // Check arity - if (arity != (arity_t)tokens.size()) { + if (arity != tokens.size()) + { arity_fail_row = i + 1; in.seekg(beg); in.clear(); // in case it has reached the eof - OC_ASSERT(false, - "ERROR: Input file inconsistent: the %uth row has a " - "different number of columns than the rest of the file. 
" - "All rows should have the same number of columns.\n", - arity_fail_row.load()); + throw SyntaxException(TRACE_INFO, + "ERROR: Input file inconsistent: the %uth row has a " + "different number of columns than the rest of the file. " + "All rows should have the same number of columns.\n", + arity_fail_row.load()); } // Infer type boost::transform(types, tokens, types.begin(), - infer_type_from_token2); + infer_type_from_token2); } // Determine has_header has_header = is_header(maybe_header, types); // Determine type signature - if (has_header) { - - // if unspecified, the target is the first column - unsigned target_idx = 0; - - // target feature will be ignored - if (!target_feature.empty()) { - auto target_it = std::find(maybe_header.begin(), maybe_header.end(), - target_feature); - OC_ASSERT(target_it != maybe_header.end(), "Target %s not found", - target_feature.c_str()); - target_idx = std::distance(maybe_header.begin(), target_it); - } - vector ignore_idxs = + if (has_header) + { + std::vector ignore_idxs = get_indices(ignore_features, maybe_header); - ignore_idxs.push_back(target_idx); boost::sort(ignore_idxs); - - // Include timestamp feature as idx to ignore - if (!timestamp_feature.empty()) { - auto timestamp_it = std::find(maybe_header.begin(), maybe_header.end(), - timestamp_feature); - OC_ASSERT(timestamp_it != maybe_header.end(), - "Timestamp feature %s not found", - timestamp_feature.c_str()); - unsigned timestamp_idx = std::distance(maybe_header.begin(), timestamp_it); - ignore_idxs.push_back(timestamp_idx); - boost::sort(ignore_idxs); - } - - // Generate type signature - type_node otype = types[target_idx]; - type_node_seq itypes; - for (unsigned i = 0; i < types.size(); ++i) - if (!boost::binary_search(ignore_idxs, i)) - itypes.push_back(types[i]); - tt = gen_signature(itypes, otype); - } else { - // No header, the target is the first column - type_node otype = types[0]; - types.erase(types.begin()); - tt = gen_signature(types, otype); } 
logger().debug() << "Infered type tree: " << tt; From e45aa9f925ae0bda4006fa240dc9dab4c9f5cd14 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 20:20:43 +0300 Subject: [PATCH 19/56] More cleanup --- opencog/persist/csv/table_read.cc | 122 ++++++++++-------------------- opencog/persist/csv/table_read.h | 15 +--- 2 files changed, 40 insertions(+), 97 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 53d9ac7c43..c7d2dd253e 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -519,10 +519,30 @@ std::istream& istreamITable(std::istream& in, ITable& tab, // ================================================================== +/** + * Get indices (aka positions or offsets) of a list of labels given a + * header. The labels can be sequenced in any order, it will always + * return the order consistent with the header. + */ +static std::vector +get_indices(const string_seq &labels, + const string_seq &header) +{ + std::vector res; + for (size_t i = 0; i < header.size(); ++i) + if (std::find(labels.begin(), labels.end(), header[i]) != labels.end()) + res.push_back(i); + return res; +} + +// ================================================================== + static std::istream& inferTableAttributes(std::istream& in, const std::vector& ignore_features, - std::vector& tt, bool& has_header) + std::vector& ignore_idxs, + std::vector& tt, + bool& has_header) { has_header = false; @@ -580,11 +600,9 @@ inferTableAttributes(std::istream& in, // Determine type signature if (has_header) { - std::vector ignore_idxs = - get_indices(ignore_features, maybe_header); + ignore_idxs = get_indices(ignore_features, maybe_header); boost::sort(ignore_idxs); } - logger().debug() << "Infered type tree: " << tt; in.seekg(beg); in.clear(); // in case it has reached the eof @@ -606,21 +624,17 @@ istreamTable(const Handle& anchor, std::istream& in, const std::vector& ignore_features) { - // Infer 
the properties of the table without loading its content - bool has_header; - std::streampos beg = in.tellg(); - inferTableAttributes(in, ignore_features, tt, has_header); - in.seekg(beg); - - if (is_sparse) { - // fallback on the old loader - // TODO: this could definitely be optimized - OC_ASSERT(timestamp_feature.empty(), "Timestamp feature not implemented"); - return istreamTable_OLD(in, tab, target_feature, ignore_features); - } else { - return istreamDenseTable(in, tab, target_feature, timestamp_feature, - ignore_features, tt, has_header); - } + std::streampos beg = in.tellg(); + + // Infer the properties of the table without loading its content + bool has_header = false; + std::vector ignore_indexes; + std::vector col_types; + inferTableAttributes(in, ignore_features, ignore_indexes, + col_types, has_header); + in.seekg(beg); + + return istreamDenseTable(anchor, in, ignore_indexes, col_types, has_header); } // ================================================================== @@ -655,12 +669,10 @@ tokenizeRowIO ( // ================================================================== -static istream& -istreamDenseTable_noHeader(istream& in, Table& tab, - int target_idx, // < 0 == ignore - int timestamp_idx, // < 0 == ignore - const vector& ignore_idxs, - const type_tree& tt, bool has_header) +static std::istream& +istreamDenseTable(istream& in, Table& tab, + const vector& ignore_idxs, + const type_tree& tt, bool has_header) { // Get the entire dataset into memory (cleaning weird stuff) std::string line; @@ -732,64 +744,6 @@ istreamDenseTable_noHeader(istream& in, Table& tab, return in; } -istream& istreamDenseTable(istream& in, Table& tab, - const std::string& target_feature, - const std::string& timestamp_feature, - const vector& ignore_features, - const type_tree& tt, bool has_header) -{ - OC_ASSERT(has_header - || (target_feature.empty() - && ignore_features.empty() - && timestamp_feature.empty()), - "If the data file has no header, " - "then a target 
feature, ignore features or " - "timestamp_feature cannot be specified"); - - // determine target, timestamp and ignore indexes - int target_idx = 0; // if no header, target is at the first - // column by default - - int timestamp_idx = -1; // disabled by default - vector ignore_idxs; - if (has_header) { - std::string line; - get_data_line(in, line); - vector header = tokenizeRow(line); - - // Set target idx - if (!target_feature.empty()) { - auto target_it = std::find(header.begin(), header.end(), - target_feature); - OC_ASSERT(target_it != header.end(), "Target %s not found", - target_feature.c_str()); - target_idx = std::distance(header.begin(), target_it); - } - - // Set timestamp idx - if (!timestamp_feature.empty()) { - auto timestamp_it = std::find(header.begin(), header.end(), - timestamp_feature); - OC_ASSERT(timestamp_it != header.end(), "Timestamp feature %s not found", - timestamp_feature.c_str()); - timestamp_idx = std::distance(header.begin(), timestamp_it); - } - - // Set ignore idxs - ignore_idxs = get_indices(ignore_features, header); - - // get input and output labels from the header - auto iotlabels = tokenizeRowIOT(line, ignore_idxs, - target_idx, timestamp_idx); - tab.itable.set_labels(std::get<0>(iotlabels)); - tab.otable.set_label(std::get<1>(iotlabels)); - tab.ttable.set_label(std::get<2>(iotlabels)); - } - - return istreamDenseTable_noHeader(in, tab, target_idx, timestamp_idx, - ignore_idxs, tt, has_header); -} - #endif // ================================================================== @@ -805,7 +759,7 @@ void loadTable(const Handle& anchor, throw RuntimeException(TRACE_INFO, "Could not open %s", file_name.c_str()); - istreamTable(acnhro, in, ignore_features); + istreamTable(anchor, in, ignore_features); } // ================================================================== diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 7b8164a84b..10c7e19be5 100644 --- a/opencog/persist/csv/table_read.h +++ 
b/opencog/persist/csv/table_read.h @@ -105,21 +105,10 @@ std::istream& istreamRawITable( std::istream& istreamITable(std::istream& in, ITable& tab, const string_seq& ignore_features); -std::istream& istreamTable(std::istream& in, Table& tab, +std::istream& istreamTable(const Handle& anchor, + std::istream& in, const string_seq& ignore_features); -// TODO: reimplement loadITable with the same model of loadTable and -// remove loadITable_optimized -Table loadITable( - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - -Table loadITable_optimized( - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - -/** - */ void loadTable(const Handle& anchor, const std::string& file_name, const string_seq& ignore_features=string_seq()); From 25241339bf53b42056b2cf2fbf20868100c4fcbb Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 20:34:22 +0300 Subject: [PATCH 20/56] Reorder order of teh code --- opencog/persist/csv/table_read.cc | 84 ++++++++++++++----------------- 1 file changed, 37 insertions(+), 47 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index c7d2dd253e..508d50af74 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -609,34 +609,6 @@ inferTableAttributes(std::istream& in, return in; } -/** - * Perform 2 passes: - * - * 1) Infer - * 1.1) its type - * 1.2) whether it has a header - * 1.3) whether it is dense or sparse - * - * 2) Load the actual data. 
- */ -std::istream& -istreamTable(const Handle& anchor, - std::istream& in, - const std::vector& ignore_features) -{ - std::streampos beg = in.tellg(); - - // Infer the properties of the table without loading its content - bool has_header = false; - std::vector ignore_indexes; - std::vector col_types; - inferTableAttributes(in, ignore_features, ignore_indexes, - col_types, has_header); - in.seekg(beg); - - return istreamDenseTable(anchor, in, ignore_indexes, col_types, has_header); -} - // ================================================================== #if 0 @@ -667,36 +639,26 @@ tokenizeRowIO ( return res; } +#endif + // ================================================================== static std::istream& -istreamDenseTable(istream& in, Table& tab, - const vector& ignore_idxs, - const type_tree& tt, bool has_header) +istreamDenseTable(const Handle& anchor, + std::istream& in, + const std::vector& ignore_idxs, + const std::vector& col_types, + bool has_header) { // Get the entire dataset into memory (cleaning weird stuff) std::string line; - std::vector lines; + std::vector lines; while (get_data_line(in, line)) lines.push_back(line); - // Allocate all rows in the itable, otable and ttable - tab.itable.resize(lines.size()); - tab.otable.resize(lines.size()); - if (timestamp_idx >= 0) - tab.ttable.resize(lines.size()); - // Get the elementary io types type_node_seq itypes = vector_comp(get_signature_inputs(tt), get_type_node); - type_node otype = get_type_node(get_signature_output(tt)); - - // Assign the io type to the table - tab.itable.set_types(itypes); - tab.otable.set_type(otype); - - // Instantiate type conversion for inputs - from_tokens_visitor ftv(itypes); // Function to parse each line (to be called in parallel) auto parse_line = [&](unsigned i) { @@ -744,7 +706,35 @@ istreamDenseTable(istream& in, Table& tab, return in; } -#endif +// ================================================================== + +/** + * Perform 2 passes: + * + * 1) Infer + * 
1.1) its type + * 1.2) whether it has a header + * 1.3) whether it is dense or sparse + * + * 2) Load the actual data. + */ +std::istream& +istreamTable(const Handle& anchor, + std::istream& in, + const std::vector& ignore_features) +{ + std::streampos beg = in.tellg(); + + // Infer the properties of the table without loading its content + bool has_header = false; + std::vector ignore_indexes; + std::vector col_types; + inferTableAttributes(in, ignore_features, ignore_indexes, + col_types, has_header); + in.seekg(beg); + + return istreamDenseTable(anchor, in, ignore_indexes, col_types, has_header); +} // ================================================================== From c7b6ca9ab73904ec790fa09c111b16f69d86ffc6 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 20:39:54 +0300 Subject: [PATCH 21/56] Code that compiles. --- opencog/persist/csv/table_read.cc | 12 +++++------- opencog/persist/csv/table_read.h | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 508d50af74..febf61a03e 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -656,10 +656,7 @@ istreamDenseTable(const Handle& anchor, while (get_data_line(in, line)) lines.push_back(line); - // Get the elementary io types - type_node_seq itypes = - vector_comp(get_signature_inputs(tt), get_type_node); - +#if 0 // Function to parse each line (to be called in parallel) auto parse_line = [&](unsigned i) { try { @@ -702,6 +699,7 @@ istreamDenseTable(const Handle& anchor, if (timestamp_idx >= 0) tab.timestamp_pos = timestamp_idx - boost::count_if(ignore_idxs, arg1 < timestamp_idx); +#endif return in; } @@ -719,9 +717,9 @@ istreamDenseTable(const Handle& anchor, * 2) Load the actual data. 
*/ std::istream& -istreamTable(const Handle& anchor, - std::istream& in, - const std::vector& ignore_features) +opencog::istreamTable(const Handle& anchor, + std::istream& in, + const std::vector& ignore_features) { std::streampos beg = in.tellg(); diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 10c7e19be5..d03425b8a8 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -105,8 +105,8 @@ std::istream& istreamRawITable( std::istream& istreamITable(std::istream& in, ITable& tab, const string_seq& ignore_features); -std::istream& istreamTable(const Handle& anchor, - std::istream& in, +std::istream& istreamTable(const Handle&, + std::istream&, const string_seq& ignore_features); void loadTable(const Handle& anchor, From 70b2eaa407f06778c932daffed2e81a178138187 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 20:47:01 +0300 Subject: [PATCH 22/56] Remove unused code --- opencog/persist/csv/table_read.cc | 68 ++----------------------------- 1 file changed, 3 insertions(+), 65 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index febf61a03e..b29e3f0e8d 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -59,20 +59,6 @@ using boost::phoenix::arg_names::arg1; // ------------------------------------------------------- -/** - * Return true if the next chars in 'in' correspond to carriage return - * (support UNIX and DOS format) and advance in of the checked chars. 
- */ -static bool checkCarriageReturn(std::istream& in) -{ - char next_c = in.get(); - if (next_c == '\r') // DOS format - next_c = in.get(); - if (next_c == '\n') - return true; - return false; -} - /** * remove the carriage return (for DOS format) */ @@ -135,35 +121,6 @@ std::istream& get_data_line(std::istream& is, std::string& line) // ------------------------------------------------------- -static const char *sparse_delim = " : "; - -/** - * parse a pair of key/value in a parse dataset, using ':' as - * delimiter. For instance - * - * parse_key_val("key : val") - * - * returns - * - * {"key", "val"} - * - * If no such delimiter is found then it return a pair with empty key - * and empty val. - */ -static std::pair -parse_key_val(const std::string& chunk) -{ - std::pair res; - size_t pos = chunk.find(sparse_delim); - if (std::string::npos == pos) - return res; - std::string key = chunk.substr(0, pos); - boost::trim(key); - std::string val = chunk.substr(pos + strlen(sparse_delim)); - boost::trim(val); - return {key, val}; -} - /** * Take a row, return a tokenizer. Tokenization uses the * separator characters comma, blank, tab (',', ' ' or '\t'). @@ -322,6 +279,7 @@ ValuePtr opencog::token_to_vertex(Type tipe, const std::string& token) // =========================================================== // istream regular tables. +static const char *sparse_delim = " : "; /** * Fill the input table, given a file in DSV (delimiter-seperated values) @@ -424,33 +382,13 @@ std::vector infer_column_types(const std::vector& tab) return types; } -/** - * Infer the column types of the first line of a raw input table and - * compare it to the given column types. If there is a mis-match, - * then the first row must be a header, i.e. a set of ascii column - * labels. 
- */ -static bool has_header(ITable& tab, const std::vector& col_types) -{ - const string_seq& row = *tab.begin(); - - size_t arity = row.size(); - - for (size_t i=0; i& col_types) +static bool +is_header(const string_seq& tokens, const std::vector& col_types) { for (size_t i = 0; i < tokens.size(); i++) { From b4789caca68d8ffcc70823c8b2cae58402f908d8 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 20:57:40 +0300 Subject: [PATCH 23/56] Remove more dead code --- opencog/persist/csv/table_read.cc | 123 ++++++------------------------ 1 file changed, 23 insertions(+), 100 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index b29e3f0e8d..56ac84786e 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -407,54 +407,6 @@ std::vector get_header(const std::string& file_name) return tokenizeRow(line); } -#if 0 - -/** - * Fill the input table only, given a DSV (delimiter-seperated values) - * file format, where delimiters are ',', ' ' or '\t'. - * - * This algorithm makes several passes over the data. First, it reads - * the entire table, as a collection of strings. Next, it tries to - * infer the column types, and the presence of a header. - */ -std::istream& istreamITable(std::istream& in, ITable& tab, - const std::vector& ignore_features) -{ - istreamRawITable(in, tab); - - // Determine the column types. - std::vector col_types = infer_column_types(tab); - - // If there is a header row, then it must be the column labels. - if (has_header(tab, col_types)) - { - // tab.set_labels(*tab.begin()); - tab.erase(tab.begin()); - } - - // Now that we have some column labels to work off of, - // Get rid of the unwanted columns. - // tab.delete_columns(ignore_features); - - // determined ignore_indices - std::vector ignore_indices = get_indices(ignore_features, - get_header(file_name)); - - -.... 
- // Finally, perform a column type conversion - from_tokens_visitor ftv(tab.get_types()); - auto aft = apply_visitor(ftv); - OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), - [&](multi_type_seq& seq) { - return aft(seq.get_variant()); - }); - - return in; -} - -#endif - // ================================================================== /** @@ -549,38 +501,6 @@ inferTableAttributes(std::istream& in, // ================================================================== -#if 0 -/** - * Take a line and return a pair with vector containing the input - * elements and then output element. - */ -template -std::pair, T> -tokenizeRowIO ( - const std::string& line, - const std::vector& ignored_indices=std::vector(), - unsigned target_idx=0) -{ - std::pair, T> res; - table_tokenizer toker = get_row_tokenizer(line); - size_t i = 0; - for (const std::string& tok : toker) { - if (!boost::binary_search(ignored_indices, i)) { - T el = boost::lexical_cast(tok); - if (target_idx == i) - res.second = el; - else - res.first.push_back(el); - } - i++; - } - return res; -} - -#endif - -// ================================================================== - static std::istream& istreamDenseTable(const Handle& anchor, std::istream& in, @@ -588,11 +508,29 @@ istreamDenseTable(const Handle& anchor, const std::vector& col_types, bool has_header) { - // Get the entire dataset into memory (cleaning weird stuff) - std::string line; - std::vector lines; - while (get_data_line(in, line)) - lines.push_back(line); + std::string line; + + // Assume the stream is at the begining. + // If there is a header, skip one line. + if (has_header) + get_data_line(in, line); + + // Loop over all lines in the table, one by one. 
+ while (get_data_line(in, line)) + { + table_tokenizer toker = get_row_tokenizer(line); + size_t i = 0; + for (const std::string& tok : toker) { + if (!boost::binary_search(ignored_indices, i)) { + T el = boost::lexical_cast(tok); + if (target_idx == i) + res.second = el; + else + res.first.push_back(el); + } + i++; + } + } #if 0 // Function to parse each line (to be called in parallel) @@ -601,21 +539,6 @@ istreamDenseTable(const Handle& anchor, // Fill input auto tokenIOT = tokenizeRowIOT(lines[i], ignore_idxs, target_idx, timestamp_idx); - tab.itable[i] = ftv(std::get<0>(tokenIOT)); - - // Fill output - std::string output_str = std::get<1>(tokenIOT); - // If there is no valid target index, then there is no - // "output" column! - if ("" != output_str) - tab.otable[i] = token_to_vertex(otype, output_str); - - // Fill date - std::string date_str = std::get<2>(tokenIOT); - // If there is no valid timestamp index, then there is no - // "output" column! - if ("" != date_str) - tab.ttable[i] = TTable::from_string(date_str); } catch (AssertionException& ex) { unsigned lineno = has_header? i+1 : i; From 41a43a1b3af12eeaa4bb396b74c284a6feef2139 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 21:16:13 +0300 Subject: [PATCH 24/56] Prepare columns that will be filled in. --- opencog/persist/csv/table_read.cc | 36 ++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 56ac84786e..c712ee8adc 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -125,7 +125,7 @@ std::istream& get_data_line(std::istream& is, std::string& line) * Take a row, return a tokenizer. Tokenization uses the * separator characters comma, blank, tab (',', ' ' or '\t'). 
*/ -table_tokenizer get_row_tokenizer(const std::string& line) +table_tokenizer opencog::get_row_tokenizer(const std::string& line) { typedef boost::escaped_list_separator separator; typedef boost::tokenizer tokenizer; @@ -508,6 +508,38 @@ istreamDenseTable(const Handle& anchor, const std::vector& col_types, bool has_header) { + // Width of table in the input. + size_t table_width = col_types.size(); + + // Effective width is the width, without the ignored columns. + size_t effective_width = table_width - ignore_idxs.size(); + + // Setup a mask; should we skip the column? + std::vector skip_col(table_width, false); + for (unsigned i : ignore_idxs) + skip_col[i] = true; + + // Set up typed columns. + std::vector> bool_cols; + std::vector> float_cols; + std::vector> string_cols; + + for (size_t ic = 0; ic < table_width; ic++) + { + if (skip_col[ic]) continue; + if (BOOL_VALUE == col_types[ic]) + bool_cols.push_back(std::vector()); + else + if (FLOAT_VALUE == col_types[ic]) + float_cols.push_back(std::vector()); + else + if (STRING_VALUE == col_types[ic]) + string_cols.push_back(std::vector()); + else + throw RuntimeException(TRACE_INFO, + "Unhandled column type"); + } + std::string line; // Assume the stream is at the begining. 
@@ -519,6 +551,7 @@ istreamDenseTable(const Handle& anchor, while (get_data_line(in, line)) { table_tokenizer toker = get_row_tokenizer(line); +#if 0 size_t i = 0; for (const std::string& tok : toker) { if (!boost::binary_search(ignored_indices, i)) { @@ -530,6 +563,7 @@ istreamDenseTable(const Handle& anchor, } i++; } +#endif } #if 0 From 635a4796d2d3dc34314273d0f84e44afe33714ea Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 21:27:49 +0300 Subject: [PATCH 25/56] Read boolean columns in the table --- opencog/persist/csv/table_read.cc | 105 ++++++++++++++---------------- 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index c712ee8adc..b5bf7979ce 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -230,53 +230,6 @@ infer_type_from_token2(Type curr_guess, const std::string& token) return STRING_VALUE; } -/// cast string "token" to a vertex of type "tipe" -static ValuePtr token_to_boolean(const std::string& token) -{ - if ("0" == token || "F" == token || "f" == token) - return createBoolValue(false); - - if ("1" == token || "T" == token || "t" == token) - return createBoolValue(true); - - throw SyntaxException(TRACE_INFO, - "Expecting boolean value, got %s", token.c_str()); -} - -static ValuePtr token_to_contin(const std::string& token) -{ - try { - return createFloatValue(lexical_cast(token)); - } catch (boost::bad_lexical_cast&) { - throw SyntaxException(TRACE_INFO, - "Could not cast %s to floating point", token.c_str()); - } -} - -ValuePtr opencog::token_to_vertex(Type tipe, const std::string& token) -{ - if (BOOL_VALUE == tipe) - return token_to_boolean(token); - - if (FLOAT_VALUE == tipe) - return token_to_contin(token); - - if (STRING_VALUE == tipe) - { - // Enum types must begin with an alpha character - if (isalpha(token[0])) - return createStringValue(token); - - throw SyntaxException(TRACE_INFO, - "Enum type must 
begin with alphabetic char, but %s doesn't", - token.c_str()); - } - - throw SyntaxException(TRACE_INFO, - "Unable to convert token \"%s\" to type=%d", - token.c_str(), tipe); -} - // =========================================================== // istream regular tables. static const char *sparse_delim = " : "; @@ -501,6 +454,30 @@ inferTableAttributes(std::istream& in, // ================================================================== +/// cast string "token" to a vertex of type "tipe" +static bool token_to_bool(const std::string& token) +{ + if ("0" == token || "F" == token || "f" == token) + return false; + + if ("1" == token || "T" == token || "t" == token) + return true; + + throw SyntaxException(TRACE_INFO, + "Expecting boolean value, got %s", token.c_str()); +} + +static double token_to_contin(const std::string& token) +{ + try { + return boost::lexical_cast(token); + } catch (boost::bad_lexical_cast&) { + throw SyntaxException(TRACE_INFO, + "Could not cast %s to floating point", token.c_str()); + } +} + + static std::istream& istreamDenseTable(const Handle& anchor, std::istream& in, @@ -551,19 +528,33 @@ istreamDenseTable(const Handle& anchor, while (get_data_line(in, line)) { table_tokenizer toker = get_row_tokenizer(line); + size_t ic = 0; + size_t bc = 0; + size_t fc = 0; + size_t sc = 0; + for (const std::string& tok : toker) + { + if (skip_col[ic]) { ic++; continue; } + if (BOOL_VALUE == col_types[ic]) + { + bool_cols[bc].push_back(token_to_bool(tok)); + bc ++; + ic ++; + continue; + } #if 0 - size_t i = 0; - for (const std::string& tok : toker) { - if (!boost::binary_search(ignored_indices, i)) { + else + if (FLOAT_VALUE == col_types[ic]) + float_cols.push_back(std::vector()); + else + if (STRING_VALUE == col_types[ic]) + string_cols.push_back(std::vector()); + +xxx T el = boost::lexical_cast(tok); - if (target_idx == i) - res.second = el; - else - res.first.push_back(el); - } - i++; - } + res.first.push_back(el); #endif + } } #if 0 From 
02bb9bd5becd2e2fdec292aa83c827b2381e8add Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 21:30:33 +0300 Subject: [PATCH 26/56] Handle the remaining column types --- opencog/persist/csv/table_read.cc | 57 +++++++++---------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index b5bf7979ce..3b5d785fad 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -542,52 +542,29 @@ istreamDenseTable(const Handle& anchor, ic ++; continue; } -#if 0 - else + if (FLOAT_VALUE == col_types[ic]) - float_cols.push_back(std::vector()); - else + { + float_cols[fc].push_back(token_to_contin(tok)); + fc ++; + ic ++; + continue; + } + if (STRING_VALUE == col_types[ic]) - string_cols.push_back(std::vector()); + { + string_cols[sc].push_back(tok); + sc ++; + ic ++; + continue; + } -xxx - T el = boost::lexical_cast(tok); - res.first.push_back(el); -#endif + throw RuntimeException(TRACE_INFO, + "Unhandled column type"); } } -#if 0 - // Function to parse each line (to be called in parallel) - auto parse_line = [&](unsigned i) { - try { - // Fill input - auto tokenIOT = tokenizeRowIOT(lines[i], ignore_idxs, - target_idx, timestamp_idx); - } - catch (AssertionException& ex) { - unsigned lineno = has_header? 
i+1 : i; - OC_ASSERT(false, "Parsing error occurred on line %d of input file\n" - "Exception: %s", lineno, ex.what()); - } - }; - - // Call it for each line in parallel - auto ir = boost::irange((size_t)0, lines.size()); - vector row_idxs(ir.begin(), ir.end()); - OMP_ALGO::for_each(row_idxs.begin(), row_idxs.end(), parse_line); - - // Assign the target position relative to the ignored indices - // (useful for writing that file back) - tab.target_pos = target_idx - boost::count_if(ignore_idxs, - arg1 < target_idx); - - if (timestamp_idx >= 0) - tab.timestamp_pos = timestamp_idx - - boost::count_if(ignore_idxs, arg1 < timestamp_idx); -#endif - - return in; + return in; } // ================================================================== From 9c16a7530499b88552f489f66d7fd5dade0d2130 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 21:47:36 +0300 Subject: [PATCH 27/56] Stub out or remove dead code --- opencog/persist/csv/table_read.cc | 26 ++++++----------- opencog/persist/csv/table_read.h | 47 +++++++++++-------------------- 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 3b5d785fad..07d5f66179 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -26,16 +26,10 @@ #include #include -#include -#include +#include #include #include #include -#include -#include - -#include -#include #include #include @@ -53,10 +47,6 @@ using namespace opencog; -using namespace boost; -using namespace boost::phoenix; -using boost::phoenix::arg_names::arg1; - // ------------------------------------------------------- /** @@ -231,6 +221,8 @@ infer_type_from_token2(Type curr_guess, const std::string& token) } // =========================================================== +#ifdef NOT_USED_ANYWHERE + // istream regular tables. 
static const char *sparse_delim = " : "; @@ -275,14 +267,14 @@ std::istream& istreamRawITable(std::istream& in, ITable& tab, lines.push_back(line); // Determine the arity from the first line. - std::vector fl = tokenizeRow(lines[0], ignored_indices); + std::vector fl = tokenizeRow(lines[0]); size_t arity = fl.size(); std::atomic arity_fail_row(-1); auto parse_line = [&](size_t i) { // tokenize the line and fill the table with - tab[i] = tokenizeRow(lines[i], ignored_indices); + tab[i] = tokenizeRow(lines[i]); // Check arity if (arity != tab[i].size()) @@ -306,6 +298,7 @@ std::istream& istreamRawITable(std::istream& in, ITable& tab, } return in; } +#endif // NOT_USED_ANYWHERE // =========================================================== @@ -357,7 +350,7 @@ std::vector get_header(const std::string& file_name) std::ifstream in(file_name.c_str()); std::string line; get_data_line(in, line); - return tokenizeRow(line); + return tokenizeRow(line); } // ================================================================== @@ -402,8 +395,7 @@ inferTableAttributes(std::istream& in, lines.push_back(line); // Parse what could be a header - const std::vector maybe_header = - tokenizeRow(lines.front()); + const std::vector maybe_header = tokenizeRow(lines.front()); // Determine arity size_t arity = maybe_header.size(); @@ -417,7 +409,7 @@ inferTableAttributes(std::istream& in, for (size_t i = 1; i < lines.size(); ++i) { // Parse line - const string_seq& tokens = tokenizeRow(lines[i]); + const string_seq& tokens = tokenizeRow(lines[i]); // Check arity if (arity != tokens.size()) diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index d03425b8a8..7732ca3826 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -32,22 +32,12 @@ #include #include -#include -#include -#include #include #include namespace opencog { -/** - * Convert strings to typed values - */ -ValuePtr token_to_vertex(Type, const std::string&); - -// 
=========================================================== - typedef boost::tokenizer> table_tokenizer; /** @@ -59,28 +49,23 @@ table_tokenizer get_row_tokenizer(const std::string& line); /** * Take a line and return a vector containing the elements parsed. */ -template -static std::vector tokenizeRow ( - const std::string& line, - const std::vector& ignored_indices=std::vector()) +static std::vector tokenizeRow (const std::string& line) { - table_tokenizer tok = get_row_tokenizer(line); - std::vector res; - unsigned i = 0; - for (const std::string& t : tok) { - - // trim away whitespace padding; failing to do this - // confuses stuff downstream. - std::string clean(t); - boost::trim(clean); - - // Sometimes the tokenizer returns pure whitespace :-( - if (0 == clean.size()) continue; - - if (!boost::binary_search(ignored_indices, i++)) - res.push_back(boost::lexical_cast(clean)); - } - return res; + table_tokenizer tok = get_row_tokenizer(line); + std::vector res; + for (const std::string& t : tok) + { + // Trim away whitespace padding; failing to do this + // confuses stuff downstream. 
+ std::string clean(t); + boost::trim(clean); + + // Sometimes the tokenizer returns pure whitespace :-( + if (0 == clean.size()) continue; + + res.push_back(clean); + } + return res; } // =========================================================== From 601a22685753fccee562006964198ca2d69b3cb4 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 21:52:11 +0300 Subject: [PATCH 28/56] More cleanup --- opencog/persist/csv/table_read.cc | 12 ++---------- opencog/persist/csv/table_read.h | 12 +++--------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 07d5f66179..434374afb3 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -50,7 +50,7 @@ using namespace opencog; // ------------------------------------------------------- /** - * remove the carriage return (for DOS format) + * Remove the carriage return (for DOS format). */ static void removeCarriageReturn(std::string& str) { @@ -60,7 +60,7 @@ static void removeCarriageReturn(std::string& str) } /** - * remove non ASCII char at the begining of the string + * Remove non ASCII char at the begining of the string. 
*/ static void removeNonASCII(std::string& str) { @@ -345,14 +345,6 @@ is_header(const string_seq& tokens, const std::vector& col_types) return false; } -std::vector get_header(const std::string& file_name) -{ - std::ifstream in(file_name.c_str()); - std::string line; - get_data_line(in, line); - return tokenizeRow(line); -} - // ================================================================== /** diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 7732ca3826..dd5b64bd14 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -80,15 +80,9 @@ typedef std::vector Table; // =========================================================== -// Get the header of a DSV file (assuming there is one) -string_seq get_header(const std::string& input_file); - -std::istream& istreamRawITable( - std::istream& in, ITable& table, - const std::vector& ignored_indices=std::vector()); - -std::istream& istreamITable(std::istream& in, ITable& tab, - const string_seq& ignore_features); +//std::istream& istreamRawITable( +// std::istream& in, ITable& table, +// const std::vector& ignored_indices=std::vector()); std::istream& istreamTable(const Handle&, std::istream&, From aab0dd5c6e3215267fc59cdac026eb92a48bc2dc Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 22:14:38 +0300 Subject: [PATCH 29/56] Start passing column names in --- opencog/persist/csv/table_read.cc | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 434374afb3..ff0ded4c2a 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -370,6 +370,7 @@ inferTableAttributes(std::istream& in, const std::vector& ignore_features, std::vector& ignore_idxs, std::vector& tt, + std::vector& maybe_header, bool& has_header) { has_header = false; @@ -383,11 +384,11 @@ inferTableAttributes(std::istream& in, // Get 
a portion of the dataset into memory (cleaning weird stuff) std::vector lines; std::string line; - while (get_data_line(in, line) && maxline-- > 0) + while (get_data_line(in, line) and 0 < maxline--) lines.push_back(line); // Parse what could be a header - const std::vector maybe_header = tokenizeRow(lines.front()); + maybe_header = tokenizeRow(lines.front()); // Determine arity size_t arity = maybe_header.size(); @@ -467,13 +468,14 @@ istreamDenseTable(const Handle& anchor, std::istream& in, const std::vector& ignore_idxs, const std::vector& col_types, + const std::vector& header, bool has_header) { // Width of table in the input. size_t table_width = col_types.size(); // Effective width is the width, without the ignored columns. - size_t effective_width = table_width - ignore_idxs.size(); + // size_t effective_width = table_width - ignore_idxs.size(); // Setup a mask; should we skip the column? std::vector skip_col(table_width, false); @@ -571,14 +573,25 @@ opencog::istreamTable(const Handle& anchor, std::streampos beg = in.tellg(); // Infer the properties of the table without loading its content - bool has_header = false; std::vector ignore_indexes; std::vector col_types; + std::vector header; + bool has_header = false; inferTableAttributes(in, ignore_features, ignore_indexes, - col_types, has_header); + col_types, header, has_header); + + // If the header is missing, then fake it. 
+ if (not has_header) + { + header.clear(); + for (size_t i=0; i Date: Sat, 20 Aug 2022 22:28:24 +0300 Subject: [PATCH 30/56] Start placing the values on the anchor --- opencog/persist/csv/table_read.cc | 32 ++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index ff0ded4c2a..d7af4015cf 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -37,7 +37,7 @@ #include #include -#include +#include #include #include #include @@ -550,6 +550,36 @@ istreamDenseTable(const Handle& anchor, } } + // Now that we've read everything in, + // place the individual columns into the anchor atom. + AtomSpace* as = anchor->getAtomSpace(); + size_t bc = 0; + size_t fc = 0; + size_t sc = 0; + for (size_t ic = 0; ic < table_width; ic++) + { + if (skip_col[ic]) { ic++; continue; } + if (BOOL_VALUE == col_types[ic]) + { + Handle key = as->add_node(PREDICATE_NODE, std::string(header[ic])); + ValuePtr bvp = createBoolValue(bool_cols[bc]); + as->set_value(anchor, key, bvp); + bc ++; + ic ++; + } +#if 0 + else + if (FLOAT_VALUE == col_types[ic]) + float_cols.push_back(std::vector()); + else + if (STRING_VALUE == col_types[ic]) + string_cols.push_back(std::vector()); + else +#endif + throw RuntimeException(TRACE_INFO, + "Unhandled column type"); + } + return in; } From d867b1e8f86d5791edae3b058bef0ffaa8dde486 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 22:35:56 +0300 Subject: [PATCH 31/56] Handle the other kinds of columns --- opencog/persist/csv/table_read.cc | 33 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index d7af4015cf..27d32230ec 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -559,25 +559,24 @@ istreamDenseTable(const Handle& anchor, for (size_t ic = 0; ic < 
table_width; ic++) { if (skip_col[ic]) { ic++; continue; } + + ValuePtr vp; if (BOOL_VALUE == col_types[ic]) - { - Handle key = as->add_node(PREDICATE_NODE, std::string(header[ic])); - ValuePtr bvp = createBoolValue(bool_cols[bc]); - as->set_value(anchor, key, bvp); - bc ++; - ic ++; - } -#if 0 - else - if (FLOAT_VALUE == col_types[ic]) - float_cols.push_back(std::vector()); - else - if (STRING_VALUE == col_types[ic]) - string_cols.push_back(std::vector()); + vp = createBoolValue(bool_cols[bc++]); + + else if (FLOAT_VALUE == col_types[ic]) + vp = createFloatValue(float_cols[fc++]); + + else if (STRING_VALUE == col_types[ic]) + vp = createStringValue(string_cols[sc++]); + else -#endif - throw RuntimeException(TRACE_INFO, - "Unhandled column type"); + throw RuntimeException(TRACE_INFO, + "Unhandled column type"); + + Handle key = as->add_node(PREDICATE_NODE, std::string(header[ic])); + as->set_value(anchor, key, vp); + ic ++; } return in; From 5bc0176287aecf9139ff937b9fa3dfe30db74e21 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 22:41:23 +0300 Subject: [PATCH 32/56] Add the list of keys to a well-known location --- opencog/persist/csv/table_read.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 27d32230ec..a9c82fb557 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -556,6 +557,7 @@ istreamDenseTable(const Handle& anchor, size_t bc = 0; size_t fc = 0; size_t sc = 0; + HandleSeq keylist; for (size_t ic = 0; ic < table_width; ic++) { if (skip_col[ic]) { ic++; continue; } @@ -576,9 +578,16 @@ istreamDenseTable(const Handle& anchor, Handle key = as->add_node(PREDICATE_NODE, std::string(header[ic])); as->set_value(anchor, key, vp); + keylist.push_back(key); ic ++; } + // And finally, place a list of all the keys in a well-known + // location. 
+ Handle klp = as->add_node(PREDICATE_NODE, std::string("*-column-keys-*")); + ValuePtr kvp = createLinkValue(keylist); + as->set_value(anchor, klp, kvp); + return in; } From 20edb093cf1f0d93242fb96cc9167b1f0bd9f2f6 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 22:43:33 +0300 Subject: [PATCH 33/56] More header cleanup --- opencog/persist/csv/table_read.cc | 2 ++ opencog/persist/csv/table_read.h | 5 ----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index a9c82fb557..59bd87468e 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -227,6 +227,8 @@ infer_type_from_token2(Type curr_guess, const std::string& token) // istream regular tables. static const char *sparse_delim = " : "; +typedef std::vector ITable; + /** * Fill the input table, given a file in DSV (delimiter-seperated values) * format. The delimiters are ',', ' ' or '\t'. diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index dd5b64bd14..eaf6e208d7 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -73,11 +73,6 @@ static std::vector tokenizeRow (const std::string& line) // TODO: Should this be a StringValue? typedef std::vector string_seq; -typedef std::vector ITable; - -// TODO Should this be a TableValue? 
-typedef std::vector Table; - // =========================================================== //std::istream& istreamRawITable( From 36a4ccbbbcba4692dfa46b1b3b14360b3a8696e9 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 22:53:01 +0300 Subject: [PATCH 34/56] Move stuff from header to c file --- opencog/persist/csv/table_read.cc | 37 ++++++++++++++----------------- opencog/persist/csv/table_read.h | 32 -------------------------- 2 files changed, 17 insertions(+), 52 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 59bd87468e..502a5827eb 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -112,11 +112,13 @@ std::istream& get_data_line(std::istream& is, std::string& line) // ------------------------------------------------------- +typedef boost::tokenizer> table_tokenizer; + /** * Take a row, return a tokenizer. Tokenization uses the * separator characters comma, blank, tab (',', ' ' or '\t'). */ -table_tokenizer opencog::get_row_tokenizer(const std::string& line) +static table_tokenizer get_row_tokenizer(const std::string& line) { typedef boost::escaped_list_separator separator; typedef boost::tokenizer tokenizer; @@ -126,29 +128,24 @@ table_tokenizer opencog::get_row_tokenizer(const std::string& line) return tokenizer(line, sep); } -// Same as above, but only allow commas as a column separator. -table_tokenizer get_sparse_row_tokenizer(const std::string& line) -{ - typedef boost::escaped_list_separator separator; - typedef boost::tokenizer tokenizer; - - // Tokenize line; currently, we allow tabs, commas, blanks. - static const separator sep("\\", ",", "\""); - return tokenizer(line, sep); -} - /** * Take a line and return a vector containing the elements parsed. - * Used by istreamTable. This will modify the line to remove leading - * non-ASCII characters, as well as stripping of any carriage-returns. 
*/ -std::vector tokenizeSparseRow(const std::string& line) +static std::vector tokenizeRow (const std::string& line) { - table_tokenizer tok = get_sparse_row_tokenizer(line); + table_tokenizer tok = get_row_tokenizer(line); std::vector res; - for (std::string t : tok) { - boost::trim(t); - res.push_back(t); + for (const std::string& t : tok) + { + // Trim away whitespace padding; failing to do this + // confuses stuff downstream. + std::string clean(t); + boost::trim(clean); + + // Sometimes the tokenizer returns pure whitespace :-( + if (0 == clean.size()) continue; + + res.push_back(clean); } return res; } @@ -158,7 +155,7 @@ std::vector tokenizeSparseRow(const std::string& line) * Given an input string, guess the type of the string. * Inferable types are: boolean, contin and enum. */ -Type infer_type_from_token(const std::string& token) +static Type infer_type_from_token(const std::string& token) { /* Prefered representation is T's and 0's, to maximize clarity, * readability. Numeric values are easily confused with floating diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index eaf6e208d7..fdc6782204 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -38,38 +38,6 @@ namespace opencog { -typedef boost::tokenizer> table_tokenizer; - -/** - * Take a row, return a tokenizer. Tokenization uses the - * separator characters comma, blank, tab (',', ' ' or '\t'). - */ -table_tokenizer get_row_tokenizer(const std::string& line); - -/** - * Take a line and return a vector containing the elements parsed. - */ -static std::vector tokenizeRow (const std::string& line) -{ - table_tokenizer tok = get_row_tokenizer(line); - std::vector res; - for (const std::string& t : tok) - { - // Trim away whitespace padding; failing to do this - // confuses stuff downstream. 
- std::string clean(t); - boost::trim(clean); - - // Sometimes the tokenizer returns pure whitespace :-( - if (0 == clean.size()) continue; - - res.push_back(clean); - } - return res; -} - -// =========================================================== - // TODO: Should this be a StringValue? typedef std::vector string_seq; From e98033808d4297eeb7b4cf547d8a9ae6a6000edf Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:01:05 +0300 Subject: [PATCH 35/56] Add documentation --- opencog/persist/csv/table_read.cc | 42 ++++++++++++++++++++++++++++--- opencog/persist/csv/table_read.h | 12 +++------ 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 502a5827eb..a464e4727b 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -633,9 +633,45 @@ opencog::istreamTable(const Handle& anchor, // ================================================================== -void loadTable(const Handle& anchor, - const std::string& file_name, - const string_seq& ignore_features) +/** + * Load columns from a CSV file and place them into Atomese Values on + * the indicated Atom. Atomese Values are vectors (of floats, bools, + * srings, or more complex structures). Each Value holds one column + * from the dataset. + * + * The features (columns) specified in ignore_features will be omitted + * from the representation. + * + * For example, a CSV dataset like this: + * o, i1, i2, i3, i4 + * 1, 0, 0, 3.3, "foo" + * 0, 1, 0, 4.4, "bar" + * + * will be loaded as key-value pairs on the `anchor` Atom. 
+ * + * First, at the "well known location" + * (Predicate "*-column-keys-*") + * there will be a list of all of the column-keys in the table: + * (LinkValue + * (Predicate "o") + * (Predicate "i1") + * (Predicate "i2") + * (Predicate "i3") + * (Predicate "i4")) + * + * Next, under each key, there will a column of values: + * (Predicate "o") (BoolValue 1 0) + * (Predicate "i1") (BoolValue 0 1) + * (Predicate "i2") (BoolValue 0 0) + * (Predicate "i3") (FloatValue 3.3 4.4) + * (Predicate "i4") (StringValue "foo" "bar") + * + * @param file_name + * @param ignore_features + */ +void load_cvs_table(const Handle& anchor, + const std::string& file_name, + const string_seq& ignore_features) { if (file_name.empty()) throw RuntimeException(TRACE_INFO, "The file name is empty!"); diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index fdc6782204..d83241a2b3 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -31,9 +31,6 @@ #include #include -#include -#include - #include namespace opencog { @@ -41,20 +38,19 @@ namespace opencog { // TODO: Should this be a StringValue? typedef std::vector string_seq; -// =========================================================== +void load_csv_table(const Handle& anchor, + const std::string& file_name, + const string_seq& ignore_features=string_seq()); //std::istream& istreamRawITable( // std::istream& in, ITable& table, // const std::vector& ignored_indices=std::vector()); +// Same as above, but works for an already-open stream. 
std::istream& istreamTable(const Handle&, std::istream&, const string_seq& ignore_features); -void loadTable(const Handle& anchor, - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - } // ~namespaces opencog #endif // _ATOMESE_TABLE_READ_H From 44869c1050e64ec9ad67f24acb4c6825dd83eb97 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:02:43 +0300 Subject: [PATCH 36/56] Remove un-needed files --- opencog/persist/csv/CMakeLists.txt | 3 +- opencog/persist/csv/load_csv.cc | 60 ------------------------------ opencog/persist/csv/load_csv.h | 41 -------------------- 3 files changed, 1 insertion(+), 103 deletions(-) delete mode 100644 opencog/persist/csv/load_csv.cc delete mode 100644 opencog/persist/csv/load_csv.h diff --git a/opencog/persist/csv/CMakeLists.txt b/opencog/persist/csv/CMakeLists.txt index d358336b58..739333f0ff 100644 --- a/opencog/persist/csv/CMakeLists.txt +++ b/opencog/persist/csv/CMakeLists.txt @@ -1,7 +1,6 @@ # Generic JSON decoding. ADD_LIBRARY (csv - load_csv.cc table_read.cc ) @@ -18,7 +17,7 @@ INSTALL (TARGETS csv EXPORT AtomSpaceTargets ) INSTALL (FILES - load_csv.h + table_read.h DESTINATION "include/opencog/persist/csv" ) diff --git a/opencog/persist/csv/load_csv.cc b/opencog/persist/csv/load_csv.cc deleted file mode 100644 index 6c55f5f000..0000000000 --- a/opencog/persist/csv/load_csv.cc +++ /dev/null @@ -1,60 +0,0 @@ -/** - * load_csv.cc -- Load CSV tables into Values - * - * Copyright (C) 2022 Linas Vepstas - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License v3 as - * published by the Free Software Foundation and including the exceptions - * at http://opencog.org/wiki/Licenses - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program; if not, write to: - * Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include - -#include -#include "load_csv.h" - -using namespace opencog; - -/** - * Load columns from a CSV file and place them into Atomese Values on - * the indicated Atom. Atomese Values are vectors (of floats, bools, - * srings, or more complex structures). Each Value holds one column - * from the dataset. - * - * The features (columns) specified in ignore_features will be omitted - * from the representation. - * - * For example, a CSV dataset like this: - * o, i1, i2, i3, i4 - * 1, 0, 0, 3.3, "foo" - * 0, 1, 0, 4.4, "bar" - * - * will be loaded as the following key-value pairs on the `anchor` Atom: - * (Predicate "*-column-names-*") (StringValue "o", "i1", "i2", "i3", "i4") - * (Predicate "o") (BoolValue 1 0) - * (Predicate "i1") (BoolValue 0 1) - * (Predicate "i2") (BoolValue 0 0) - * (Predicate "i3") (FloatValue 3.3 4.4) - * (Predicate "i4") (StringValue "foo" "bar") - * - * @param file_name - * @param ignore_features - * @return - */ -void load_csv_table( - const Handle& anchor, - const std::string& file_name, - const std::vector& ignore_features) -{ -} diff --git a/opencog/persist/csv/load_csv.h b/opencog/persist/csv/load_csv.h deleted file mode 100644 index f073d6336b..0000000000 --- a/opencog/persist/csv/load_csv.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * load_csv.h -- Load CSV tables into Values - * - * Copyright (C) 2018 OpenCog Foundation - * Copyright (C) 2022 Linas Vepstas - * - * Author: Yidnekachew Wondimu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License v3 as - * published by the Free Software Foundation and including the exceptions - * at 
http://opencog.org/wiki/Licenses - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program; if not, write to: - * Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#ifndef _ATOMESE_LOAD_CSV_H -#define _ATOMESE_LOAD_CSV_H - -#include - -namespace opencog { - -// Load columns from a CSV file and place them into Atomese Values on -// the indicated Atom. See the .cc file for additional info. -void load_csv_table( - const Handle& anchor, - const std::string& file_name, - const std::vector& ignore_features=std::vector()); - -} // end namespace opencog - -#endif //_ATOMESE_LOAD_CSV_H From 829d9340197a24ab3e8acd6aebe83f4b9f02707e Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:10:23 +0300 Subject: [PATCH 37/56] Move documentation around --- opencog/persist/csv/table_read.cc | 52 ++++++++----------------------- opencog/persist/csv/table_read.h | 36 +++++++++++++++++++++ 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index a464e4727b..4778abba58 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -462,7 +462,9 @@ static double token_to_contin(const std::string& token) } } - +// See header file for `load_csv_table` for a general description +// of what is being done here. In breif, columns from a table +// are jammed into individual values on a given atom. static std::istream& istreamDenseTable(const Handle& anchor, std::istream& in, @@ -482,7 +484,7 @@ istreamDenseTable(const Handle& anchor, for (unsigned i : ignore_idxs) skip_col[i] = true; - // Set up typed columns. 
+ // Set up typed columns. They're empty at first. std::vector> bool_cols; std::vector> float_cols; std::vector> string_cols; @@ -503,6 +505,7 @@ istreamDenseTable(const Handle& anchor, "Unhandled column type"); } + // ---------------------------------------------- std::string line; // Assume the stream is at the begining. @@ -511,6 +514,8 @@ istreamDenseTable(const Handle& anchor, get_data_line(in, line); // Loop over all lines in the table, one by one. + // Stuff the desired columns into each of the columns + // we created above. while (get_data_line(in, line)) { table_tokenizer toker = get_row_tokenizer(line); @@ -551,7 +556,11 @@ istreamDenseTable(const Handle& anchor, } // Now that we've read everything in, - // place the individual columns into the anchor atom. + // place the individual columns into Values, + // and then each value under's its column name, + // all of these on the anchor atom. + + // XXX TODO, we should probably take AtomSpace as an argument!? AtomSpace* as = anchor->getAtomSpace(); size_t bc = 0; size_t fc = 0; @@ -633,42 +642,7 @@ opencog::istreamTable(const Handle& anchor, // ================================================================== -/** - * Load columns from a CSV file and place them into Atomese Values on - * the indicated Atom. Atomese Values are vectors (of floats, bools, - * srings, or more complex structures). Each Value holds one column - * from the dataset. - * - * The features (columns) specified in ignore_features will be omitted - * from the representation. - * - * For example, a CSV dataset like this: - * o, i1, i2, i3, i4 - * 1, 0, 0, 3.3, "foo" - * 0, 1, 0, 4.4, "bar" - * - * will be loaded as key-value pairs on the `anchor` Atom. 
- * - * First, at the "well known location" - * (Predicate "*-column-keys-*") - * there will be a list of all of the column-keys in the table: - * (LinkValue - * (Predicate "o") - * (Predicate "i1") - * (Predicate "i2") - * (Predicate "i3") - * (Predicate "i4")) - * - * Next, under each key, there will a column of values: - * (Predicate "o") (BoolValue 1 0) - * (Predicate "i1") (BoolValue 0 1) - * (Predicate "i2") (BoolValue 0 0) - * (Predicate "i3") (FloatValue 3.3 4.4) - * (Predicate "i4") (StringValue "foo" "bar") - * - * @param file_name - * @param ignore_features - */ +// See header file for general description. void load_cvs_table(const Handle& anchor, const std::string& file_name, const string_seq& ignore_features) diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index d83241a2b3..c16d4fc2cb 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -38,6 +38,42 @@ namespace opencog { // TODO: Should this be a StringValue? typedef std::vector string_seq; +/** + * Load columns from a CSV file and place them into Atomese Values on + * the indicated Atom. Atomese Values are vectors (of floats, bools, + * srings, or more complex structures). Each Value holds one column + * from the dataset. + * + * The features (columns) specified in ignore_features will be omitted + * from the representation. + * + * For example, a CSV dataset like this: + * o, i1, i2, i3, i4 + * 1, 0, 0, 3.3, "foo" + * 0, 1, 0, 4.4, "bar" + * + * will be loaded as key-value pairs on the `anchor` Atom. 
+ * + * First, at the "well known location" + * (Predicate "*-column-keys-*") + * there will be a list of all of the column-keys in the table: + * (LinkValue + * (Predicate "o") + * (Predicate "i1") + * (Predicate "i2") + * (Predicate "i3") + * (Predicate "i4")) + * + * Next, under each key, there will a column of values: + * (Predicate "o") (BoolValue 1 0) + * (Predicate "i1") (BoolValue 0 1) + * (Predicate "i2") (BoolValue 0 0) + * (Predicate "i3") (FloatValue 3.3 4.4) + * (Predicate "i4") (StringValue "foo" "bar") + * + * @param file_name + * @param ignore_features + */ void load_csv_table(const Handle& anchor, const std::string& file_name, const string_seq& ignore_features=string_seq()); From c15a912a76764eff4f44688bad57270d838e63e7 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:26:31 +0300 Subject: [PATCH 38/56] Add a README to explain what is going on --- opencog/persist/README.md | 3 +- opencog/persist/csv/README.md | 50 ++++++++++++++++++++++++++++++++ opencog/persist/csv/table_read.h | 2 +- 3 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 opencog/persist/csv/README.md diff --git a/opencog/persist/README.md b/opencog/persist/README.md index 81de0b5ef6..1e51b39928 100644 --- a/opencog/persist/README.md +++ b/opencog/persist/README.md @@ -17,7 +17,8 @@ Local subdirectories include: for RocksDB and one that allows AtomSpaces to trade Atoms over the network.) -* csv -- Load Values from CSV/TSV files. Each column in the CSV +* csv -- Load Values from CSV/TSV files. These are "delimiter + separated values" -- ordinary tables. Each column in the table is loaded into an appropriate Value (`FloatValue`, `BoolValue` or `StringValue`). The values are placed under keys (named after the column) on the provided Atom. 
diff --git a/opencog/persist/csv/README.md b/opencog/persist/csv/README.md
new file mode 100644
index 0000000000..87a4f06ed7
--- /dev/null
+++ b/opencog/persist/csv/README.md
@@ -0,0 +1,50 @@
+
+Load Ordinary CSV Tables
+========================
+The code here is able to load "delimiter-separated values" (DSV,
+or CSV, TSV for comma and tab separators) from a file. These are
+just very conventional tables.
+
+Each column from a DSV file is read in and placed into an Atomese
+Values on an indicated Atom. Atomese Values are vectors (of floats,
+bools, strings). Each Value holds one column from the dataset.
+
+Basically, this just gets CSV data into the AtomSpace, where it
+becomes easy for Atomese programs to act on them, i.e. to use them
+as input for some kind of data stream processing.
+
+The features (columns) specified in ignore_features will be omitted
+from the representation.
+
+Example
+-------
+For example, a CSV dataset like this:
+```
+  o, i1, i2, i3, i4
+  1, 0, 0, 3.3, "foo"
+  0, 1, 0, 4.4, "bar"
+```
+will be loaded as key-value pairs on the `anchor` Atom.
+
+The column names will be loaded under a "well known key":
+```
+  (Predicate "*-column-keys-*")
+```
+This key will point at a value holding a list of all of the
+column-keys in the table:
+```
+  (LinkValue
+    (Predicate "o")
+    (Predicate "i1")
+    (Predicate "i2")
+    (Predicate "i3")
+    (Predicate "i4"))
+```
+Then, under each key, there will be a column of values:
+```
+  (Predicate "o") (BoolValue 1 0)
+  (Predicate "i1") (BoolValue 0 1)
+  (Predicate "i2") (BoolValue 0 0)
+  (Predicate "i3") (FloatValue 3.3 4.4)
+  (Predicate "i4") (StringValue "foo" "bar")
+```
diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h
index c16d4fc2cb..4df8031530 100644
--- a/opencog/persist/csv/table_read.h
+++ b/opencog/persist/csv/table_read.h
@@ -41,7 +41,7 @@ typedef std::vector string_seq;
 /**
  * Load columns from a CSV file and place them into Atomese Values on
  * the indicated Atom. 
Atomese Values are vectors (of floats, bools, - * srings, or more complex structures). Each Value holds one column + * strings, or more complex structures). Each Value holds one column * from the dataset. * * The features (columns) specified in ignore_features will be omitted From 7c8256c7b13a68fae9f8b355dca4ff911cbeb2f0 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:46:24 +0300 Subject: [PATCH 39/56] Start work on a unit test for CSV --- opencog/persist/csv/table_read.cc | 2 +- tests/persist/CMakeLists.txt | 1 + tests/persist/csv/CMakeLists.txt | 3 ++ tests/persist/csv/CSVLoadUTest.cxxtest | 65 ++++++++++++++++++++++++++ tests/persist/csv/simple.csv | 10 ++++ 5 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 tests/persist/csv/CMakeLists.txt create mode 100644 tests/persist/csv/CSVLoadUTest.cxxtest create mode 100644 tests/persist/csv/simple.csv diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 4778abba58..6f200f5485 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -592,7 +592,7 @@ istreamDenseTable(const Handle& anchor, // And finally, place a list of all the keys in a well-known // location. 
- Handle klp = as->add_node(PREDICATE_NODE, std::string("*-column-keys-*")); + Handle klp = as->add_node(PREDICATE_NODE, "*-column-keys-*"); ValuePtr kvp = createLinkValue(keylist); as->set_value(anchor, klp, kvp); diff --git a/tests/persist/CMakeLists.txt b/tests/persist/CMakeLists.txt index f97cfcbe3f..eff8ffb3fb 100644 --- a/tests/persist/CMakeLists.txt +++ b/tests/persist/CMakeLists.txt @@ -1,3 +1,4 @@ +ADD_SUBDIRECTORY (csv) ADD_SUBDIRECTORY (sexpr) ADD_SUBDIRECTORY (sql) ADD_SUBDIRECTORY (tlb) diff --git a/tests/persist/csv/CMakeLists.txt b/tests/persist/csv/CMakeLists.txt new file mode 100644 index 0000000000..ae0ac79321 --- /dev/null +++ b/tests/persist/csv/CMakeLists.txt @@ -0,0 +1,3 @@ +LINK_LIBRARIES(atomspace csv) + +ADD_CXXTEST(CSVLoadUTest) diff --git a/tests/persist/csv/CSVLoadUTest.cxxtest b/tests/persist/csv/CSVLoadUTest.cxxtest new file mode 100644 index 0000000000..e4981364ae --- /dev/null +++ b/tests/persist/csv/CSVLoadUTest.cxxtest @@ -0,0 +1,65 @@ +/* + * CSVLoadUTest.cxxtest + * + * Copyright (c) 2022 Linas Vepstas + * SPDX-License-Identifier: AGPL-3.0-or-later + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + + +#include +#include +#include + +using namespace opencog; + +class CSVLoadUTest : public CxxTest::TestSuite { + +private: + AtomSpacePtr _asp; + +public: + CSVLoadUTest() { + logger().set_print_to_stdout_flag(true); + _asp = createAtomSpace(); + } + + void setUp() { _asp->clear(); } + + void tearDown() {} + + void test_simple_load(); +}; + +// Test load_csv_table +void CSVLoadUTest::test_simple_load() +{ + logger().info("BEGIN TEST: %s", __FUNCTION__); + + Handle h = _asp->add_node(CONCEPT_NODE, "foo"); + + load_csv_table(h, "simple.csv"); + + // There's the five columns, plus the table of contents. + HandleSet keys = h->getKeys(); + TS_ASSERT_EQUALS(6, keys.size()); + + Handle colkey = _asp->add_node(PREDICATE_NODE, "*-column-keys-*"); + ValuePtr kvp = h->getValue(colkey); + + logger().info("END TEST: %s", __FUNCTION__); +} diff --git a/tests/persist/csv/simple.csv b/tests/persist/csv/simple.csv new file mode 100644 index 0000000000..da3ab5c488 --- /dev/null +++ b/tests/persist/csv/simple.csv @@ -0,0 +1,10 @@ +# +# This is a simple demo CSV file +# It contains some comments, a column header +# and some data. +# +o, i1, i2, i3, i4 + +# Above was the column headers. Now the data. + 1, 0, 0, 3.3, "foo" + 0, 1, 0, 4.4, "bar" From 8171c146996abaec5f7a1d288eb8b5e57f7529d8 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:56:00 +0300 Subject: [PATCH 40/56] Fix typo in the name --- opencog/persist/csv/table_read.cc | 6 +++--- tests/persist/csv/CMakeLists.txt | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 6f200f5485..7046240654 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -643,9 +643,9 @@ opencog::istreamTable(const Handle& anchor, // ================================================================== // See header file for general description. 
-void load_cvs_table(const Handle& anchor, - const std::string& file_name, - const string_seq& ignore_features) +void opencog::load_csv_table(const Handle& anchor, + const std::string& file_name, + const string_seq& ignore_features) { if (file_name.empty()) throw RuntimeException(TRACE_INFO, "The file name is empty!"); diff --git a/tests/persist/csv/CMakeLists.txt b/tests/persist/csv/CMakeLists.txt index ae0ac79321..9c111c7d1d 100644 --- a/tests/persist/csv/CMakeLists.txt +++ b/tests/persist/csv/CMakeLists.txt @@ -1,3 +1,3 @@ -LINK_LIBRARIES(atomspace csv) +LINK_LIBRARIES(csv atomspace) ADD_CXXTEST(CSVLoadUTest) From a9c5b230c90d38af6c3076eb25a46ca5aeb325f3 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 00:17:22 +0300 Subject: [PATCH 41/56] Bug fix, failed to pass types along --- opencog/persist/csv/table_read.cc | 4 ++-- tests/persist/csv/CSVLoadUTest.cxxtest | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 7046240654..8429da0761 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -369,7 +369,7 @@ static std::istream& inferTableAttributes(std::istream& in, const std::vector& ignore_features, std::vector& ignore_idxs, - std::vector& tt, + std::vector& types, std::vector& maybe_header, bool& has_header) { @@ -395,7 +395,7 @@ inferTableAttributes(std::istream& in, std::atomic arity_fail_row(-1); // Determine initial type - std::vector types(arity, VOID_VALUE); + types.resize(arity, VOID_VALUE); // Parse the rest, determine its type and whether the arity is // consistent diff --git a/tests/persist/csv/CSVLoadUTest.cxxtest b/tests/persist/csv/CSVLoadUTest.cxxtest index e4981364ae..480587d264 100644 --- a/tests/persist/csv/CSVLoadUTest.cxxtest +++ b/tests/persist/csv/CSVLoadUTest.cxxtest @@ -52,7 +52,8 @@ void CSVLoadUTest::test_simple_load() Handle h = _asp->add_node(CONCEPT_NODE, "foo"); - load_csv_table(h, 
"simple.csv"); + // Argh. Ugly. Fix. + load_csv_table(h, "../tests/persist/csv/simple.csv"); // There's the five columns, plus the table of contents. HandleSet keys = h->getKeys(); From 0a0f8a6b708e38189f968073b53377b9131930ce Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 00:22:52 +0300 Subject: [PATCH 42/56] nother bug fix --- opencog/persist/csv/table_read.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 8429da0761..6aebcb1dd0 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -518,12 +518,12 @@ istreamDenseTable(const Handle& anchor, // we created above. while (get_data_line(in, line)) { - table_tokenizer toker = get_row_tokenizer(line); size_t ic = 0; size_t bc = 0; size_t fc = 0; size_t sc = 0; - for (const std::string& tok : toker) + std::vector toks = tokenizeRow(line); + for (const std::string& tok : toks) { if (skip_col[ic]) { ic++; continue; } if (BOOL_VALUE == col_types[ic]) From eff2d58f6cb97f966c40c11e50aa3901a98ea681 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 00:30:42 +0300 Subject: [PATCH 43/56] Another bugfix --- opencog/persist/csv/table_read.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 6aebcb1dd0..a111049343 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -568,7 +568,7 @@ istreamDenseTable(const Handle& anchor, HandleSeq keylist; for (size_t ic = 0; ic < table_width; ic++) { - if (skip_col[ic]) { ic++; continue; } + if (skip_col[ic]) continue; ValuePtr vp; if (BOOL_VALUE == col_types[ic]) @@ -587,7 +587,6 @@ istreamDenseTable(const Handle& anchor, Handle key = as->add_node(PREDICATE_NODE, std::string(header[ic])); as->set_value(anchor, key, vp); keylist.push_back(key); - ic ++; } // And finally, place a list of all the keys 
in a well-known

From f9272b8bd857b67295fb8bf5c9b1d4fdd3aab6af Mon Sep 17 00:00:00 2001
From: Linas Vepstas
Date: Sun, 21 Aug 2022 00:41:54 +0300
Subject: [PATCH 44/56] Expand the unit test some more

---
 tests/persist/csv/CSVLoadUTest.cxxtest | 28 ++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/persist/csv/CSVLoadUTest.cxxtest b/tests/persist/csv/CSVLoadUTest.cxxtest
index 480587d264..fa30e2d1b8 100644
--- a/tests/persist/csv/CSVLoadUTest.cxxtest
+++ b/tests/persist/csv/CSVLoadUTest.cxxtest
@@ -61,6 +61,34 @@ void CSVLoadUTest::test_simple_load()
 	Handle colkey = _asp->add_node(PREDICATE_NODE, "*-column-keys-*");
 	ValuePtr kvp = h->getValue(colkey);
+	TS_ASSERT_EQUALS(5, kvp->size());
+
+	// Loop over the columns
+	LinkValuePtr lvp = LinkValueCast(kvp);
+	HandleSeq keylist = lvp->to_handle_seq();
+	for (const Handle& key : keylist)
+	{
+		ValuePtr vp = h->getValue(key);
+		TS_ASSERT_EQUALS(2, vp->size());
+		printf("Column %s is %s\n", key->to_short_string().c_str(),
+			vp->to_string().c_str());
+	}
+
+	// Loop over columns again, verify types. 
+ int bc = 0; + int fc = 0; + int sc = 0; + for (const Handle& key : keylist) + { + ValuePtr vp = h->getValue(key); + Type vt = vp->get_type(); + if (BOOL_VALUE == vt) bc++; + if (FLOAT_VALUE == vt) fc++; + if (STRING_VALUE == vt) sc++; + } + TS_ASSERT_EQUALS(3, bc); + TS_ASSERT_EQUALS(1, fc); + TS_ASSERT_EQUALS(1, sc); logger().info("END TEST: %s", __FUNCTION__); } From be9286759112ed5330bac31c3544a3b19c16e6db Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 00:56:31 +0300 Subject: [PATCH 45/56] Add scheme bindings to the table loader --- opencog/persist/csv/CMakeLists.txt | 18 ++++++ opencog/persist/csv/TableSCM.cc | 88 ++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 opencog/persist/csv/TableSCM.cc diff --git a/opencog/persist/csv/CMakeLists.txt b/opencog/persist/csv/CMakeLists.txt index 739333f0ff..6cc3f7c35d 100644 --- a/opencog/persist/csv/CMakeLists.txt +++ b/opencog/persist/csv/CMakeLists.txt @@ -22,3 +22,21 @@ INSTALL (FILES ) # ------------------------------- + +ADD_LIBRARY (csv-table + TableSCM.cc +) + +TARGET_LINK_LIBRARIES(csv-table + csv + atomspace + smob +) + +ADD_GUILE_EXTENSION(SCM_CONFIG csv-table "opencog-ext-path-csv-table") + +INSTALL (TARGETS csv-table EXPORT AtomSpaceTargets + DESTINATION "lib${LIB_DIR_SUFFIX}/opencog" +) + +# ------------------------------- diff --git a/opencog/persist/csv/TableSCM.cc b/opencog/persist/csv/TableSCM.cc new file mode 100644 index 0000000000..489a89edb8 --- /dev/null +++ b/opencog/persist/csv/TableSCM.cc @@ -0,0 +1,88 @@ +/* + * opencog/persist/csv/TableSCM.cc + * + * Copyright (c) 2008 by OpenCog Foundation + * Copyright (c) 2008, 2009, 2013, 2015, 2022 Linas Vepstas + * All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * 
This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _OPENCOG_CSV_TABLE_SCM_H +#define _OPENCOG_CSV_TABLE_SCM_H + +#include + +namespace opencog +{ +/** \addtogroup grp_persist + * @{ + */ + +class TableSCM : public ModuleWrap +{ +private: + void init(void); + + void load_table(const Handle&, const std::string&); +public: + TableSCM(void); +}; // class + +/** @}*/ +} // namespace + +extern "C" { +void opencog_persist_file_init(void); +}; + +#endif // _OPENCOG_CSV_TABLE_SCM_H + +#include +#include + +#include "table_read.h" + +using namespace opencog; + +TableSCM::TableSCM(void) + : ModuleWrap("opencog csv-table") +{ + static bool is_init = false; + if (is_init) return; + is_init = true; + module_init(); +} + +// Temporary(?) Hacky experimental API. Subject to change. 
+void TableSCM::init(void) +{ + define_scheme_primitive("load-table", + &TableSCM::load_table, this, "csv-table"); +} + +// ===================================================================== + +void TableSCM::load_table(const Handle& h, const std::string& path) +{ + // const AtomSpacePtr& as = SchemeSmob::ss_get_env_as("load-table"); + opencog::load_csv_table(h, path); +} + +void opencog_persist_file_init(void) +{ + static TableSCM patty; +} From 2b491d5cec75913b60cd60694379e0355174e652 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 01:03:39 +0300 Subject: [PATCH 46/56] Add the scm side of the csv-table module --- opencog/scm/CMakeLists.txt | 5 +++++ opencog/scm/opencog/csv-table.scm | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 opencog/scm/opencog/csv-table.scm diff --git a/opencog/scm/CMakeLists.txt b/opencog/scm/CMakeLists.txt index 868a9bcbbf..cb86d04524 100644 --- a/opencog/scm/CMakeLists.txt +++ b/opencog/scm/CMakeLists.txt @@ -18,6 +18,11 @@ ADD_GUILE_MODULE (FILES # Each of the files below are distinct modules. They need to be # compiled seperately. +ADD_GUILE_MODULE (FILES + opencog/csv-table.scm + COMPILE +) + ADD_GUILE_MODULE (FILES opencog/exec.scm DEPENDS exec diff --git a/opencog/scm/opencog/csv-table.scm b/opencog/scm/opencog/csv-table.scm new file mode 100644 index 0000000000..acbbea0afc --- /dev/null +++ b/opencog/scm/opencog/csv-table.scm @@ -0,0 +1,23 @@ +; +; OpenCog CSV Table Reader module +; + +(define-module (opencog csv-table)) + +(use-modules (opencog)) +(use-modules (opencog as-config)) +(load-extension + (string-append opencog-ext-path-csv-table "libcsv-table") + "opencog_csv_table_init") + +(export load-table) + +(set-procedure-property! load-table 'documentation +" + load-table ATOM FILE -- Load CSV/TSV table from FILE. + + Throws error if FILE does not exist. 
+ More documentation TBD +") + +; -------------------------------------------------------------------- From b862c41548a4f5b5a73a2fd1f1a19ef3487d5e25 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 01:09:51 +0300 Subject: [PATCH 47/56] Bug fix cut-n-paste error --- opencog/persist/csv/TableSCM.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opencog/persist/csv/TableSCM.cc b/opencog/persist/csv/TableSCM.cc index 489a89edb8..67a3283e57 100644 --- a/opencog/persist/csv/TableSCM.cc +++ b/opencog/persist/csv/TableSCM.cc @@ -46,7 +46,7 @@ class TableSCM : public ModuleWrap } // namespace extern "C" { -void opencog_persist_file_init(void); +void opencog_csv_table_init(void); }; #endif // _OPENCOG_CSV_TABLE_SCM_H @@ -82,7 +82,7 @@ void TableSCM::load_table(const Handle& h, const std::string& path) opencog::load_csv_table(h, path); } -void opencog_persist_file_init(void) +void opencog_csv_table_init(void) { static TableSCM patty; } From 6bd2075795f162b5fc53b81cf58844cf3b95cb25 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 10:05:39 +0300 Subject: [PATCH 48/56] Specify file path correctly --- tests/persist/csv/CSVLoadUTest.cxxtest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/persist/csv/CSVLoadUTest.cxxtest b/tests/persist/csv/CSVLoadUTest.cxxtest index fa30e2d1b8..c05a2f1914 100644 --- a/tests/persist/csv/CSVLoadUTest.cxxtest +++ b/tests/persist/csv/CSVLoadUTest.cxxtest @@ -53,7 +53,7 @@ void CSVLoadUTest::test_simple_load() Handle h = _asp->add_node(CONCEPT_NODE, "foo"); // Argh. Ugly. Fix. - load_csv_table(h, "../tests/persist/csv/simple.csv"); + load_csv_table(h, PROJECT_SOURCE_DIR "/tests/persist/csv/simple.csv"); // There's the five columns, plus the table of contents. 
 HandleSet keys = h->getKeys();

From dbc24f0a330f6d2d1a66b8de05b980cf6f46b158 Mon Sep 17 00:00:00 2001
From: Linas Vepstas
Date: Sun, 21 Aug 2022 10:15:27 +0300
Subject: [PATCH 49/56] Make the AtomSpace an explicit argument

---
 opencog/persist/csv/TableSCM.cc   |  4 ++--
 opencog/persist/csv/table_read.cc | 15 ++++++++-------
 opencog/persist/csv/table_read.h  |  8 +++++---
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/opencog/persist/csv/TableSCM.cc b/opencog/persist/csv/TableSCM.cc
index 67a3283e57..7eaacd71dc 100644
--- a/opencog/persist/csv/TableSCM.cc
+++ b/opencog/persist/csv/TableSCM.cc
@@ -78,8 +78,8 @@ void TableSCM::init(void)
 
 void TableSCM::load_table(const Handle& h, const std::string& path)
 {
-	// const AtomSpacePtr& as = SchemeSmob::ss_get_env_as("load-table");
-	opencog::load_csv_table(h, path);
+	const AtomSpacePtr& as = SchemeSmob::ss_get_env_as("load-table");
+	opencog::load_csv_table(as, h, path);
 }
 
diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc
index a111049343..ae9186ec82 100644
--- a/opencog/persist/csv/table_read.cc
+++ b/opencog/persist/csv/table_read.cc
@@ -466,7 +466,8 @@ static double token_to_contin(const std::string& token)
 // of what is being done here. In breif, columns from a table
 // are jammed into individual values on a given atom.
 static std::istream&
-istreamDenseTable(const Handle& anchor,
+istreamDenseTable(const AtomSpacePtr& as,
+                  const Handle& anchor,
                   std::istream& in,
                   const std::vector& ignore_idxs,
                   const std::vector& col_types,
@@ -560,8 +561,6 @@ istreamDenseTable(const Handle& anchor,
 	// and then each value under's its column name,
 	// all of these on the anchor atom.
 
-	// XXX TODO, we should probably take AtomSpace as an argument!?
-	AtomSpace* as = anchor->getAtomSpace();
 	size_t bc = 0;
 	size_t fc = 0;
 	size_t sc = 0;
@@ -611,7 +610,8 @@ istreamDenseTable(const Handle& anchor,
  * 2) Load the actual data. 
*/ std::istream& -opencog::istreamTable(const Handle& anchor, +opencog::istreamTable(const AtomSpacePtr& as, + const Handle& anchor, std::istream& in, const std::vector& ignore_features) { @@ -635,14 +635,15 @@ opencog::istreamTable(const Handle& anchor, in.seekg(beg); - return istreamDenseTable(anchor, in, ignore_indexes, + return istreamDenseTable(as, anchor, in, ignore_indexes, col_types, header, has_header); } // ================================================================== // See header file for general description. -void opencog::load_csv_table(const Handle& anchor, +void opencog::load_csv_table(const AtomSpacePtr& as, + const Handle& anchor, const std::string& file_name, const string_seq& ignore_features) { @@ -653,7 +654,7 @@ void opencog::load_csv_table(const Handle& anchor, throw RuntimeException(TRACE_INFO, "Could not open %s", file_name.c_str()); - istreamTable(anchor, in, ignore_features); + istreamTable(as, anchor, in, ignore_features); } // ================================================================== diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 4df8031530..a4504fb519 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -31,7 +31,7 @@ #include #include -#include +#include namespace opencog { @@ -74,7 +74,8 @@ typedef std::vector string_seq; * @param file_name * @param ignore_features */ -void load_csv_table(const Handle& anchor, +void load_csv_table(const AtomSpacePtr&, + const Handle& anchor, const std::string& file_name, const string_seq& ignore_features=string_seq()); @@ -83,7 +84,8 @@ void load_csv_table(const Handle& anchor, // const std::vector& ignored_indices=std::vector()); // Same as above, but works for an already-open stream. 
-std::istream& istreamTable(const Handle&, +std::istream& istreamTable(const AtomSpacePtr&, + const Handle&, std::istream&, const string_seq& ignore_features); From eb8adb0acef1601ba2378af97bc8a8cb26886855 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 11:01:47 +0300 Subject: [PATCH 50/56] Start work on a table demo. --- examples/atomspace/table.csv | 22 ++++++++++++++++++++++ examples/atomspace/table.scm | 30 ++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 examples/atomspace/table.csv create mode 100644 examples/atomspace/table.scm diff --git a/examples/atomspace/table.csv b/examples/atomspace/table.csv new file mode 100644 index 0000000000..7e805e855a --- /dev/null +++ b/examples/atomspace/table.csv @@ -0,0 +1,22 @@ +# +# This is a simple demo CSV file. +# It contains a table of data, in comma-separated-value format. +# You can also use tab-separated values. +# +# This table contains a text column header. +# The column labels can be anything. +# If the header is absent, default labels will be generated. +# +b1, b2, b3, flt1, flt2, lbl + +# Now for some data. Three columns of binary numbers, +# Two floats, and one column of strings. + 0, 0, 1, 3.3, 4.4, "one" + 0, 0, 1, 4.4, 5.5, "one" + 0, 1, 1, 3.4, 6.5, "three" + 1, 0, 1, 2.4, 7.5, "five" + +# T and F are maybe better for binary ... + T, F, T, 4, 9, "five" + T, T, F, 5, 11, "six" + T, T, T, 2, 8.9, "seven" diff --git a/examples/atomspace/table.scm b/examples/atomspace/table.scm new file mode 100644 index 0000000000..58096676a6 --- /dev/null +++ b/examples/atomspace/table.scm @@ -0,0 +1,30 @@ +; +; table.scm -- Formulas applied to Values from a CSV table. +; +; Similar to the flows.scm demo. +; +(use-modules (opencog) (opencog exec)) +(use-modules (opencog csv-table)) + +; Create an Atom on which the table will be located. +(define tab (Concept "My foo Table")) + +; Load the table (located in this directory.) 
+(load-table tab "table.csv") + +; Verify that the table loaded. First, take a look at all of the keys: +(cog-keys tab) + +; The ordered list of all the columns will be located at the +; "well-known predicate". All tables will have this; it is an +; ordered list of the columns in the table (in the same order +; as the file.) +(define colkeys (Predicate "*-column-keys-*")) +(cog-value tab colkeys) + +; Verify that the data for each column is present. +; Loop over the columns, and print the keys and values on them. +(for-each + (lambda (KEY) + (format #t "The key ~A holds data ~A\n" KEY (cog-value tab KEY))) + (cog-value->list (cog-value tab colkeys))) From e56a4af9424048c97e6e08e28b5910e4e86ba05e Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 11:14:31 +0300 Subject: [PATCH 51/56] Announce the demo --- examples/atomspace/README.md | 1 + examples/atomspace/table.scm | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/examples/atomspace/README.md b/examples/atomspace/README.md index 03aa741ab6..776a728940 100644 --- a/examples/atomspace/README.md +++ b/examples/atomspace/README.md @@ -51,6 +51,7 @@ first). * `values.scm` -- Using Values and attaching them to Atoms. * `stream.scm` -- Using a stream of time-varying Values. * `formulas.scm` -- Representing arithmetic and computing Values. +* `table.scm` -- Fetching Values from a CSV/TSV table. * `flows.scm` -- Flowing Values around. * `flow-formulas.scm` -- Dynamically updating value flows. * `multi-space.scm` -- Using multiple AtomSpaces at once. diff --git a/examples/atomspace/table.scm b/examples/atomspace/table.scm index 58096676a6..58d8fe9590 100644 --- a/examples/atomspace/table.scm +++ b/examples/atomspace/table.scm @@ -1,7 +1,11 @@ ; -; table.scm -- Formulas applied to Values from a CSV table. +; table.scm -- Formulas applied to Values from a CSV/TSV table. ; -; Similar to the flows.scm demo. 
+; This is similar to the `flows.scm` demo, except that the values
+; are fetched from a conventional DSV (delimiter-separated-value)
+; table. The demo is in two parts. The first part reads the table,
+; (a one-liner) and explores how it is represented in the AtomSpace.
+; The second part applies some formulas to the table columns.
 ;
 (use-modules (opencog) (opencog exec))
 (use-modules (opencog csv-table))
@@ -28,3 +32,18 @@
 	(lambda (KEY)
 		(format #t "The key ~A holds data ~A\n" KEY (cog-value tab KEY)))
 	(cog-value->list (cog-value tab colkeys)))
+;
+; -------------------------------------------------------------------
+; Part two: apply some formulas to the columns.
+;
+; Note that cog-value and cog-execute! ValueOf return the same thing:
+(cog-value tab (PredicateNode "flt1"))
+(cog-execute! (ValueOf tab (PredicateNode "flt1")))
+
+(cog-execute!
+	(Minus
+		(ValueOf tab (PredicateNode "flt2"))
+		(ValueOf tab (PredicateNode "flt1"))))
+
+; That's all, folks.
+; -------------------------------------------------------------------

From 577aa9bb9c69a401d02b3e6f5d9abfaf9ae4662d Mon Sep 17 00:00:00 2001
From: Linas Vepstas
Date: Sun, 21 Aug 2022 11:39:42 +0300
Subject: [PATCH 52/56] Must use FloatValueOf not ValueOf

---
 examples/atomspace/README.md |  2 +-
 examples/atomspace/flows.scm |  4 ++--
 examples/atomspace/table.scm | 27 ++++++++++++++++++++++++---
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/examples/atomspace/README.md b/examples/atomspace/README.md
index 776a728940..66477924cc 100644
--- a/examples/atomspace/README.md
+++ b/examples/atomspace/README.md
@@ -51,9 +51,9 @@ first).
 * `values.scm` -- Using Values and attaching them to Atoms.
 * `stream.scm` -- Using a stream of time-varying Values.
 * `formulas.scm` -- Representing arithmetic and computing Values.
-* `table.scm` -- Fetching Values from a CSV/TSV table.
 * `flows.scm` -- Flowing Values around.
 * `flow-formulas.scm` -- Dynamically updating value flows. 
+* `table.scm` -- Fetching Values from a CSV/TSV table.
 * `multi-space.scm` -- Using multiple AtomSpaces at once.
 
 After going through the above, go to the demos in the
diff --git a/examples/atomspace/flows.scm b/examples/atomspace/flows.scm
index 9bc65ac451..139776676f 100644
--- a/examples/atomspace/flows.scm
+++ b/examples/atomspace/flows.scm
@@ -140,7 +140,7 @@
 ; Try out some math
 (cog-execute!
 (SetValue bar kee
- (Times (ValueOf foo key) (ValueOf foo key))))
+ (Times (FloatValueOf foo key) (FloatValueOf foo key))))
 
 ; Verify
 (cog-execute! (ValueOf bar kee))
@@ -162,6 +162,6 @@
 (cog-execute!
 (SetValue bar kee
 (DefinedSchema "triangle numbers")
- (ValueOf foo key)))
+ (FloatValueOf foo key)))
 ;
 ; -------- THE END -----------
diff --git a/examples/atomspace/table.scm b/examples/atomspace/table.scm
index 58d8fe9590..604e8fc50b 100644
--- a/examples/atomspace/table.scm
+++ b/examples/atomspace/table.scm
@@ -36,14 +36,35 @@
 ; -------------------------------------------------------------------
 ; Part two: apply some formulas to the columns.
 ;
-; Note that cog-value and cog-execute! ValueOf return the same thing:
+; Note that `cog-value` and `cog-execute! ValueOf` return the same thing:
 (cog-value tab (PredicateNode "flt1"))
 (cog-execute! (ValueOf tab (PredicateNode "flt1")))
 
+; Take the difference of two columns. Note that `FloatValueOf` is
+; used instead of `ValueOf`, so that the type-checking subsystem
+; is happy about the types passed to the operator.
 (cog-execute!
 (Minus
- (ValueOf tab (PredicateNode "flt2"))
- (ValueOf tab (PredicateNode "flt1"))))
+ (FloatValueOf tab (PredicateNode "flt2"))
+ (FloatValueOf tab (PredicateNode "flt1"))))
+
+; The above can be wrapped into a function. Several examples follow,
+; below. First, a function that takes the table as an argument,
+; subtracts two columns, and places the result in a third column.
+; The column names are hard-coded in the function.
+ +(DefineLink + (DefinedSchema "col diffs") + (Lambda + (Variable "$tbl-name") + (SetValue + (Variable "$tbl-name") (Predicate "f2 minus f1") + (Minus + (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt2")) + (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt1")))))) + +(cog-execute! (DefinedSchema "col diffs") tab) + ; That's all, folks. ; ------------------------------------------------------------------- From 9e61cc08c9e766545b668ee59779b287e32749ea Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 11:46:17 +0300 Subject: [PATCH 53/56] Update unit test to use the new API. --- tests/persist/csv/CSVLoadUTest.cxxtest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/persist/csv/CSVLoadUTest.cxxtest b/tests/persist/csv/CSVLoadUTest.cxxtest index c05a2f1914..05e5a85322 100644 --- a/tests/persist/csv/CSVLoadUTest.cxxtest +++ b/tests/persist/csv/CSVLoadUTest.cxxtest @@ -53,7 +53,7 @@ void CSVLoadUTest::test_simple_load() Handle h = _asp->add_node(CONCEPT_NODE, "foo"); // Argh. Ugly. Fix. - load_csv_table(h, PROJECT_SOURCE_DIR "/tests/persist/csv/simple.csv"); + load_csv_table(_asp, h, PROJECT_SOURCE_DIR "/tests/persist/csv/simple.csv"); // There's the five columns, plus the table of contents. HandleSet keys = h->getKeys(); From f6df9940d1f618a39cbd73787c1b5738c9d92ec5 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 11:56:10 +0300 Subject: [PATCH 54/56] Provide a scoring function example. --- examples/atomspace/table.scm | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/examples/atomspace/table.scm b/examples/atomspace/table.scm index 604e8fc50b..800718b907 100644 --- a/examples/atomspace/table.scm +++ b/examples/atomspace/table.scm @@ -7,6 +7,8 @@ ; (a one-liner) and explores how it is represented in the AtomSpace. ; The second part applies some formulas to the table columns. 
; +; The second part of the demo +; (use-modules (opencog) (opencog exec)) (use-modules (opencog csv-table)) @@ -63,8 +65,35 @@ (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt2")) (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt1")))))) -(cog-execute! (DefinedSchema "col diffs") tab) +; Apply the function to the table. +(cog-execute! (Put (DefinedSchema "col diffs") tab)) + +; Verify that the new column showed up. +(cog-keys tab) + +; .. and that it contains the expected data. +(cog-value tab (Predicate "f2 minus f1")) + +;-------- +; The AccumulateLink can be used to sum up all of the rows in a column. +(cog-execute! + (Accumulate (FloatValueOf tab (Predicate "f2 minus f1")))) + +; This can be turned into a simple scoring function. It computes the +; sum-total of the difference of two columns. This is a score, in that +; it is a single number that can be used as a utility function in +; conventional machine-learning algos. +(DefineLink + (DefinedSchema "compute score") + (Lambda + (Variable "$tbl-name") + (Accumulate + (Minus + (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt2")) + (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt1")))))) +; Apply the function to the table. +(cog-execute! (Put (DefinedSchema "compute score") tab)) ; That's all, folks. ; ------------------------------------------------------------------- From 1aec9200db8d1773b3d77ff3d3ef3bd993065192 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 12:02:25 +0300 Subject: [PATCH 55/56] Add explanation of the demo --- examples/atomspace/table.scm | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/atomspace/table.scm b/examples/atomspace/table.scm index 800718b907..79c447c7ff 100644 --- a/examples/atomspace/table.scm +++ b/examples/atomspace/table.scm @@ -7,7 +7,18 @@ ; (a one-liner) and explores how it is represented in the AtomSpace. ; The second part applies some formulas to the table columns. 
 ;
-; The second part of the demo
+; The second part of the demo is interesting, because it shows how
+; functions, written in Atomese, can be applied to tables, and how
+; a "utility function" or a "scoring function" can be written.
+; Utility functions are commonly used in machine learning; they
+; provide a grand-total score that can be maximized or minimized during
+; training. The interesting point here is that the scoring function
+; is represented in Atomese: it is some tree, some DAG of inputs.
+; These trees can be randomly generated and mutated, thus allowing
+; genetic-programming algorithms to be implemented in the AtomSpace.
+;
+; This is, of course, exactly what AS-MOSES does. This is effectively
+; a demo of a sub-component of the AS-MOSES subsystem.
 ;
 (use-modules (opencog) (opencog exec))
 (use-modules (opencog csv-table))
From 54f05f471dec32eeb36c546a1aa63a6a51bee4d3 Mon Sep 17 00:00:00 2001
From: Linas Vepstas
Date: Sun, 21 Aug 2022 12:09:13 +0300
Subject: [PATCH 56/56] List additional modules.

---
 examples/atomspace/README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/atomspace/README.md b/examples/atomspace/README.md
index 66477924cc..8c42c2f6ff 100644
--- a/examples/atomspace/README.md
+++ b/examples/atomspace/README.md
@@ -193,6 +193,7 @@ everything else depends on.
 ```
 (use-modules (opencog))
 (use-modules (opencog atom-types))
+(use-modules (opencog csv-table))
 (use-modules (opencog exec))
 (use-modules (opencog logger))
 (use-modules (opencog matrix))
@@ -202,9 +203,11 @@ 
 (use-modules (opencog persist-rocks))
 (use-modules (opencog persist-sql))
 (use-modules (opencog python))
+(use-modules (opencog randgen))
 (use-modules (opencog sheaf))
 (use-modules (opencog test-runner))
 (use-modules (opencog type-utils))
+(use-modules (opencog uuid))
 ```

 There are other modules provided in other projects and repos. Here is