From 3c180e83c53f7be1b996ad97cc6102888a8aae91 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Tue, 2 Aug 2022 10:48:14 +0200 Subject: [PATCH 01/56] Start work on a CSV loader. --- opencog/persist/README.md | 7 ++++ opencog/persist/csv/load_csv.h | 62 ++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 opencog/persist/csv/load_csv.h diff --git a/opencog/persist/README.md b/opencog/persist/README.md index e40197a653..81de0b5ef6 100644 --- a/opencog/persist/README.md +++ b/opencog/persist/README.md @@ -17,6 +17,13 @@ Local subdirectories include: for RocksDB and one that allows AtomSpaces to trade Atoms over the network.) +* csv -- Load Values from CSV/TSV files. Each column in the CSV + table is loaded into an appropriate Value (`FloatValue`, + `BoolValue` or `StringValue`). The values are placed + under keys (named after the column) on the provided Atom. + This is intended for the ASMOSES subsystem, which + naturally operates on tables or streams of data. + * file -- Read and write files containing Atomese s-expressions. Provides both a `FileStorageNode`, and also some utilities to read files, and dump Atomspace contents to files or diff --git a/opencog/persist/csv/load_csv.h b/opencog/persist/csv/load_csv.h new file mode 100644 index 0000000000..6a2f2b45ae --- /dev/null +++ b/opencog/persist/csv/load_csv.h @@ -0,0 +1,62 @@ +/** load_csv.h --- + * + * Copyright (C) 2018 OpenCog Foundation + * Copyright (C) 2022 Linas Vepstas + * + * Author: Yidnekachew Wondimu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _ATOMESE_LOAD_CSV_H +#define _ATOMESE_LOAD_CSV_H + +namespace opencog { + +/** + * Load columns from a CSV file and place them into Atomese Values on + * the indicated Atom. Atomese Values are vectors (of floats, bools, + * srings, or more complex structures). Each Value holds one column + * from the dataset. + * + * The features (columns) specified in ignore_features will be omitted + * from the representation. + * + * For example, a CSV dataset like this: + * o, i1, i2, i3, i4 + * 1, 0, 0, 3.3, "foo" + * 0, 1, 0, 4.4, "bar" + * + * will be loaded as the following key-value pairs on the `anchor` Atom: + * (Predicate "*-column-names-*") (StringValue "o", "i1", "i2", "i3", "i4") + * (Predicate "o") (BoolValue 1 0) + * (Predicate "i1") (BoolValue 0 1) + * (Predicate "i2") (BoolValue 0 0) + * (Predicate "i3") (FloatValue 3.3 4.4) + * (Predicate "i4") (StringValue "foo" "bar") + * + * @param file_name + * @param ignore_features + * @return + */ +void load_csv_table( + const Handle& anchor, + const std::string& file_name, + const std::vector& ignore_features=std::vector()); + +} // end namespace opencog + +#endif //_ATOMESE_LOAD_CSV_H From f50a2038d152d5bf90a9c2c7b411bee223d44498 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Tue, 2 Aug 2022 10:54:14 +0200 Subject: [PATCH 02/56] initial scaffolding for csv tables --- opencog/persist/csv/load_csv.cc | 55 +++++++++++++++++++++++++++++++++ opencog/persist/csv/load_csv.h | 31 +++---------------- 2 files changed, 59 insertions(+), 27 deletions(-) create mode 100644 opencog/persist/csv/load_csv.cc diff --git a/opencog/persist/csv/load_csv.cc b/opencog/persist/csv/load_csv.cc new file mode 100644 index 0000000000..f69885c879 
--- /dev/null +++ b/opencog/persist/csv/load_csv.cc @@ -0,0 +1,55 @@ +/** + * load_csv.cc -- Load CSV tables into Values + * + * Copyright (C) 2022 Linas Vepstas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +using namespace opencog; + +/** + * Load columns from a CSV file and place them into Atomese Values on + * the indicated Atom. Atomese Values are vectors (of floats, bools, + * srings, or more complex structures). Each Value holds one column + * from the dataset. + * + * The features (columns) specified in ignore_features will be omitted + * from the representation. 
+ * + * For example, a CSV dataset like this: + * o, i1, i2, i3, i4 + * 1, 0, 0, 3.3, "foo" + * 0, 1, 0, 4.4, "bar" + * + * will be loaded as the following key-value pairs on the `anchor` Atom: + * (Predicate "*-column-names-*") (StringValue "o", "i1", "i2", "i3", "i4") + * (Predicate "o") (BoolValue 1 0) + * (Predicate "i1") (BoolValue 0 1) + * (Predicate "i2") (BoolValue 0 0) + * (Predicate "i3") (FloatValue 3.3 4.4) + * (Predicate "i4") (StringValue "foo" "bar") + * + * @param file_name + * @param ignore_features + * @return + */ +void load_csv_table( + const Handle& anchor, + const std::string& file_name, + const std::vector& ignore_features) +{ +} diff --git a/opencog/persist/csv/load_csv.h b/opencog/persist/csv/load_csv.h index 6a2f2b45ae..a28a0b56ae 100644 --- a/opencog/persist/csv/load_csv.h +++ b/opencog/persist/csv/load_csv.h @@ -1,4 +1,5 @@ -/** load_csv.h --- +/** + * load_csv.h -- Load CSV tables into Values * * Copyright (C) 2018 OpenCog Foundation * Copyright (C) 2022 Linas Vepstas @@ -26,32 +27,8 @@ namespace opencog { -/** - * Load columns from a CSV file and place them into Atomese Values on - * the indicated Atom. Atomese Values are vectors (of floats, bools, - * srings, or more complex structures). Each Value holds one column - * from the dataset. - * - * The features (columns) specified in ignore_features will be omitted - * from the representation. 
- * - * For example, a CSV dataset like this: - * o, i1, i2, i3, i4 - * 1, 0, 0, 3.3, "foo" - * 0, 1, 0, 4.4, "bar" - * - * will be loaded as the following key-value pairs on the `anchor` Atom: - * (Predicate "*-column-names-*") (StringValue "o", "i1", "i2", "i3", "i4") - * (Predicate "o") (BoolValue 1 0) - * (Predicate "i1") (BoolValue 0 1) - * (Predicate "i2") (BoolValue 0 0) - * (Predicate "i3") (FloatValue 3.3 4.4) - * (Predicate "i4") (StringValue "foo" "bar") - * - * @param file_name - * @param ignore_features - * @return - */ +// Load columns from a CSV file and place them into Atomese Values on +// the indicated Atom. See the .cc file for additional info. void load_csv_table( const Handle& anchor, const std::string& file_name, From b23a69ea6252bf5481eb82435be32b5bd0539bb9 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Tue, 2 Aug 2022 11:09:04 +0200 Subject: [PATCH 03/56] Copy code from asmoses --- opencog/persist/csv/table_io.cc | 1479 +++++++++++++++++++++++++++++++ opencog/persist/csv/table_io.h | 265 ++++++ 2 files changed, 1744 insertions(+) create mode 100644 opencog/persist/csv/table_io.cc create mode 100644 opencog/persist/csv/table_io.h diff --git a/opencog/persist/csv/table_io.cc b/opencog/persist/csv/table_io.cc new file mode 100644 index 0000000000..1f80c8fdc6 --- /dev/null +++ b/opencog/persist/csv/table_io.cc @@ -0,0 +1,1479 @@ +/** table_io.cc --- + * + * Copyright (C) 2010 OpenCog Foundation + * Copyright (C) 2012 Poulin Holdings LLC + * + * Authors: Nil Geisweiller + * Linas Vepstas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include "table.h" +#include "table_io.h" + +namespace opencog { namespace combo { + +using namespace std; +using namespace boost; +using namespace boost::phoenix; +using boost::phoenix::arg_names::arg1; + +// ------------------------------------------------------- + +bool checkCarriageReturn(istream& in) +{ + char next_c = in.get(); + if (next_c == '\r') // DOS format + next_c = in.get(); + if (next_c == '\n') + return true; + return false; +} + +void removeCarriageReturn(string& str) +{ + size_t s = str.size(); + if ((s > 0) && (str[s-1] == '\r')) + str.resize(s-1); +} + +//* Remove non-ascii characters at the bigining of the line, only. +void removeNonASCII(string& str) +{ + while (str.size() && (unsigned char)str[0] > 127) + str = str.substr(1); +} + +// ------------------------------------------------------- +// Return true if the character is one of the standard comment +// delimiters. Here, we define a 'standard delimiter' as one +// of hash, bang or semicolon. +bool is_comment(const char c) +{ + if ('#' == c) return true; + if (';' == c) return true; + if ('!' == c) return true; + if ('\n' == c) return true; + if ('\r' == c) return true; + if (0 == c) return true; + return false; +} + +/// Get one line of actual data. +/// This ignores lines that start with a 'standard comment char' +/// +// +// TODO: This routine should be extended so that comments that start +// somewhere other than column 0 are also ignored. 
+// +// The signature of this routine is the same as std:getline() +// +istream &get_data_line(istream& is, string& line) +{ + while (1) + { + getline(is, line); + if (!is) return is; + if (is_comment(line[0])) continue; + + // Remove weird symbols at the start of the line (only). + removeNonASCII(line); + // Remove carriage return at end of line (for DOS files). + removeCarriageReturn(line); + + return is; + } +} + +// ------------------------------------------------------- + +static const char *sparse_delim = " : "; + +/** + * parse a pair of key/value in a parse dataset, using ':' as + * delimiter. For instance + * + * parse_key_val("key : val") + * + * returns + * + * {"key", "val"} + * + * If no such delimiter is found then it return a pair with empty key + * and empty val. + */ +static pair +parse_key_val(string chunk) +{ + pair res; + size_t pos = chunk.find(sparse_delim); + if (string::npos == pos) + return res; + string key = chunk.substr(0, pos); + boost::trim(key); + string val = chunk.substr(pos + strlen(sparse_delim)); + boost::trim(val); + return {key, val}; +} + +/** + * Take a row, return a tokenizer. Tokenization uses the + * separator characters comma, blank, tab (',', ' ' or '\t'). + */ +table_tokenizer get_row_tokenizer(const std::string& line) +{ + typedef boost::escaped_list_separator separator; + typedef boost::tokenizer tokenizer; + + // Tokenize line; currently, we allow tabs, commas, blanks. + static const separator sep("\\", ",\t ", "\""); + return tokenizer(line, sep); +} + +// Same as above, but only allow commas as a column separator. +table_tokenizer get_sparse_row_tokenizer(const string& line) +{ + typedef boost::escaped_list_separator separator; + typedef boost::tokenizer tokenizer; + + // Tokenize line; currently, we allow tabs, commas, blanks. + static const separator sep("\\", ",", "\""); + return tokenizer(line, sep); +} + +/** + * Take a line and return a vector containing the elements parsed. + * Used by istreamTable. 
This will modify the line to remove leading + * non-ASCII characters, as well as stripping of any carriage-returns. + */ +vector tokenizeSparseRow(const string& line) +{ + table_tokenizer tok = get_sparse_row_tokenizer(line); + vector res; + for (string t : tok) { + boost::trim(t); + res.push_back(t); + } + return res; +} + +// ------------------------------------------------------- +/** + * Given an input string, guess the type of the string. + * Inferable types are: boolean, contin and enum. + */ +type_node infer_type_from_token(const string& token) +{ + /* Prefered representation is T's and 0's, to maximize clarity, + * readability. Numeric values are easily confused with contin + * type. + */ + if (token == "0" || + token == "1" || + token == "T" || + token == "F" || + token == "t" || + token == "f") + return id::boolean_type; + + // If it starts with an alphabetic character, assume its a string + else if (isalpha(token[0])) + return id::enum_type; + + // Hope that we can cast this to a float point number. + else { + try { + lexical_cast(token); + return id::contin_type; + } + catch(...) { + return id::ill_formed_type; + } + } +} + +/** + * Given an input string, guess the type of the string. + * Inferable types are: boolean, contin and enum. + * Compare this to 'curr_guess', and upgrade the type inference + * if it can be done consistently. + */ +static type_node +infer_type_from_token2(type_node curr_guess, const string& token) +{ + type_node tokt = infer_type_from_token(token); + + // First time, just go with the flow. + if (id::unknown_type == curr_guess) + return tokt; + + // Yayy! its consistent! + if (tokt == curr_guess) + return tokt; + + // If we saw 0,1 when expecting a contin, its a contin. + if ((id::contin_type == curr_guess) && (id::boolean_type == tokt)) + return curr_guess; + + // If we thought its a boolean 0,1 it might be a contin. 
+ if ((id::boolean_type == curr_guess) && (id::contin_type == tokt)) + return tokt; + + // If we got to here, then there's some sort of unexpected + // inconsistency in the column types; we've got to presume that + // its just some crazy ascii string, i.e. enum_type. + return id::enum_type; +} + +/// cast string "token" to a vertex of type "tipe" +builtin token_to_boolean(const string& token) +{ + if ("0" == token || "F" == token || "f" == token) + return id::logical_false; + else if ("1" == token || "T" == token || "t" == token) + return id::logical_true; + else { + OC_ASSERT(false, "Expecting boolean value, got %s", token.c_str()); + return builtin(); + } +} +contin_t token_to_contin(const string& token) +{ + try { + return lexical_cast(token); + } catch(boost::bad_lexical_cast&) { + OC_ASSERT(false, "Could not cast %s to contin", token.c_str()); + return contin_t(); + } +} +vertex token_to_vertex(const type_node &tipe, const string& token) +{ + switch (tipe) { + + case id::boolean_type: + return token_to_boolean(token); + + case id::contin_type: + return token_to_contin(token); + + case id::enum_type: + // Enum types must begin with an alpha character + if (isalpha(token[0])) + return enum_t(token); + OC_ASSERT(false, "Enum type must begin with alphabetic char, but %s doesn't", token.c_str()); + break; + + case id::definite_object_type: + return token; + break; + + // Ugly hack ... the problem adressed here is that feature + // selection has to read and propagate columns of unknown type + // (typically, dates, times). So we hack around this here. + case id::ill_formed_type: + return enum_t(token); + // return id::ill_formed_type; + // return id::null_vertex; + break; + + default: + stringstream ss; + ss << "Unable to convert token \"" << token << "\" to type=" << tipe << endl; + OC_ASSERT(0, ss.str().c_str()); + } + + // unreachable + return id::null_vertex; +} + +// =========================================================== +// istream regular tables. 
+ +/** + * Fill the input table, given a file in DSV (delimiter-seperated values) + * format. The delimiters are ',', ' ' or '\t'. + * + * It stuffs all data into the table as strings; type conversion to + * the appropriate type, and thunking for the header, and ignoring + * certain features, must all be done as a separate step. + */ +istream& istreamRawITable(istream& in, ITable& tab, + const vector& ignored_indices) +{ + streampos beg = in.tellg(); + + // Get the entire dataset into memory + string line; + std::vector lines; + + // Read first few by hand. The first might be labels, so we must + // get at least the second line. But the second line might have + // all default feature values (i.e. no colon), so get the third... + dorepeat(20) { + if (!get_data_line(in, line)) + break; + // If it is a sparse file, we are outta here. + // Throw an std::exception, since we don't want to log this as an + // error (all the other exception types log to the log file). + if (string::npos != line.find (sparse_delim)) { + in.seekg(beg); + throw std::exception(); + } + lines.push_back(line); + } + + // Grab the rest of the file. + while (get_data_line(in, line)) + lines.push_back(line); + + // Determine the arity from the first line. 
+ vector fl = tokenizeRow(lines[0], ignored_indices); + arity_t arity = fl.size(); + + std::atomic arity_fail_row(-1); + auto parse_line = [&](size_t i) + { + // tokenize the line and fill the table with + tab[i] = tokenizeRow(lines[i], ignored_indices); + + // Check arity + if (arity != (arity_t)tab[i].size()) + arity_fail_row = i + 1; + }; + + // Vector of indices [0, lines.size()) + size_t ls = lines.size(); + tab.resize(ls); + auto ir = boost::irange((size_t)0, ls); + vector indices(ir.begin(), ir.end()); + OMP_ALGO::for_each(indices.begin(), indices.end(), parse_line); + + if (-1 != arity_fail_row) { + in.seekg(beg); + OC_ASSERT(false, + "ERROR: Input file inconsistent: the %uth row has " + "a different number of columns than the rest of the file. " + "All rows should have the same number of columns.\n", + arity_fail_row.load()); + } + return in; +} + +vector get_header(const string& file_name) +{ + ifstream in(file_name.c_str()); + string line; + get_data_line(in, line); + return tokenizeRow(line); +} + +// =========================================================== +/** + * Visitor to parse a list of strings (buried in a multi_type_seq) + * into a multi_type_seq containing the typed values given the input + * type signature. 
+ */ +struct from_tokens_visitor : public boost::static_visitor +{ + from_tokens_visitor(const type_node_seq& types) : _types(types) { + all_boolean = boost::count(types, id::boolean_type) == (int)types.size(); + all_contin = boost::count(types, id::contin_type) == (int)types.size(); + } + result_type operator()(const string_seq& seq) { + result_type res; + if (all_boolean) { + res = builtin_seq(); + builtin_seq& bs = res.get_seq(); + boost::transform(seq, back_inserter(bs), token_to_boolean); + } + else if (all_contin) { + res = contin_seq(); + contin_seq& cs = res.get_seq(); + boost::transform(seq, back_inserter(cs), token_to_contin); + } + else { + res = vertex_seq(); + vertex_seq& vs = res.get_seq(); + boost::transform(_types, seq, back_inserter(vs), token_to_vertex); + } + return res; + } + template result_type operator()(const Seq& seq) { + OC_ASSERT(false, "You are not supposed to do that"); + return result_type(); + } + const type_node_seq& _types; + bool all_boolean, all_contin; +}; + + +/** + * The class below tokenizes one row, and jams it into the table + */ +struct from_sparse_tokens_visitor : public from_tokens_visitor +{ + from_sparse_tokens_visitor(const type_node_seq& types, + const std::map& index, + size_t fixed_arity) + : from_tokens_visitor(types), _index(index), _fixed_arity(fixed_arity) {} + result_type operator()(const string_seq& seq) { + using std::transform; + using std::for_each; + result_type res; + if (all_boolean) { + res = builtin_seq(_types.size(), id::logical_false); + builtin_seq& bs = res.get_seq(); + auto begin_sparse = seq.begin() + _fixed_arity; + transform(seq.begin(), begin_sparse, bs.begin(), token_to_boolean); + for (auto it = begin_sparse; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + bs[idx] = token_to_boolean(key_val.second); + } + } + } + else if (all_contin) { + res = contin_seq(_types.size(), 0.0); + contin_seq& cs = 
res.get_seq(); + auto begin_sparse = seq.cbegin() + _fixed_arity; + transform(seq.begin(), begin_sparse, cs.begin(), token_to_contin); + for (auto it = begin_sparse; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + cs[idx] = token_to_contin(key_val.second); + } + } + } + else { + res = vertex_seq(_types.size()); + vertex_seq& vs = res.get_seq(); + auto begin_sparse_types = _types.cbegin() + _fixed_arity; + auto begin_sparse_seq = seq.cbegin() + _fixed_arity; + transform(_types.begin(), begin_sparse_types, + seq.begin(), vs.begin(), token_to_vertex); + for (auto it = begin_sparse_seq; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + vs[idx] = token_to_vertex(_types[idx], key_val.second); + } + } + } + return res; + } + std::map _index; + size_t _fixed_arity; +}; + + +// =========================================================== +/** + * Fill the input table, given a file in 'sparse' format. + * + * The sparse table format consists of some fixed number of columns, + * in comma-separated format, followed by key-value pairs, also + * tab-separated. viz: + * + * val, val, val, name:val, name:val, name:val + * + * Thus, for example, a row such as + * + * earn, issued : 1, results : 2, ending : 1, including : 1 + * + * indicates that there one fixed column, of enum type, (the enum value + * being "earn"), and that features called "issued", "ending" and + * "including" have a contin value of 1.0 and "results" has a contin + * value of 2. + * + * The routine does NOT store the table in sparse format: it stores the + * full, exploded table. This could be bad ... + * TODO: we really need a sparse table format, as well. + * + * The "Raw" format has all data as strings; type conversion to the + * appropriate type, must all be done as a separate step. 
+ */ +istream& istreamSparseITable(istream& in, ITable& tab) +{ + // The raw dataset + std::vector lines; + + // The first non-comment line is assumed to be the header. + // ... unless it isn't. (The header must not contain a colon). + vector labs; + size_t fixed_arity = 0; + string header; + get_data_line(in, header); + if (string::npos == header.find(sparse_delim)) { + // Determine the arity of the fixed columns + vector hdr = tokenizeSparseRow(header); + fixed_arity = hdr.size(); + labs = hdr; + } + else { + lines.push_back(header); + } + + // Get the entire dataset into memory + string iline; + while (get_data_line(in, iline)) + lines.push_back(iline); + + if (0 == fixed_arity) { + vector fixy = tokenizeSparseRow(lines[0]); + // count commas, until a semi-colon is found. + while (string::npos == fixy[fixed_arity].find(sparse_delim)) + fixed_arity++; + } + logger().info() << "Sparse file fixed column count=" << fixed_arity; + + // Get a list of all of the features. + set feats; + // All sparse features have the same type. + type_node feat_type = id::unknown_type; + + // Fixed features may have different types, by column. + type_node_seq types(fixed_arity, id::unknown_type); + + for (const string& line : lines) { + vector chunks = tokenizeSparseRow(line); + vector::const_iterator pit = chunks.begin(); + + // Infer the types of the fixed features. + size_t off = 0; + for (; off < fixed_arity; ++off, ++pit) + types[off] = infer_type_from_token2(types[off], *pit); + + for (; pit != chunks.end(); ++pit) { + // Rip out the key-value pairs + auto key_val = parse_key_val(*pit); + if (key_val == pair()) + break; + // Store the key, uniquely. Store best guess as the type. 
+ feats.insert(key_val.first); + feat_type = infer_type_from_token2(feat_type, key_val.second); + } + } + logger().info() << "Sparse file unique features count=" << feats.size(); + logger().info() << "Sparse file feature type=" << feat_type; + logger().info() << "Sparse file row count=" << lines.size(); + + // Convert the feature set into a list of labels. + // 'index' is a map from feature name to column number. + size_t cnt = fixed_arity; + map index; + for (const string& key : feats) { + types.push_back(feat_type); + labs.push_back(key); + index[key] = cnt; + cnt++; + } + tab.set_labels(labs); + tab.set_types(types); + + // And finally, stuff up the table. + from_sparse_tokens_visitor fstv(types, index, fixed_arity); + auto fill_line = [&](int i) + { + const string& line = lines[i]; + // Tokenize the line + vector chunks = tokenizeSparseRow(line); + multi_type_seq row = fstv(chunks); + tab[i] = row; + }; + + // Vector of indices [0, lines.size()) + size_t ls = lines.size(); + tab.resize(ls); + auto ir = boost::irange((size_t)0, ls); + vector indices(ir.begin(), ir.end()); + OMP_ALGO::for_each(indices.begin(), indices.end(), fill_line); + + return in; +} + +/** + * Infer the column types of the input table. It is assumed the + * table's rows are vector of strings. + */ +type_node_seq infer_column_types(const ITable& tab) +{ + vector::const_iterator rowit = tab.begin(); + + arity_t arity = rowit->size(); + type_node_seq types(arity, id::unknown_type); + + // Skip the first line, it might be a header... + // and that would confuse type inference. 
+ if (tab.size() > 1) + ++rowit; + for (; rowit != tab.end(); ++rowit) + { + const string_seq& tokens = rowit->get_seq(); + for (arity_t i=0; iget_seq(); + + arity_t arity = row.size(); + + for (arity_t i=0; i& tokens, const type_node_seq& col_types) +{ + for (size_t i = 0; i < tokens.size(); i++) { + type_node flt = infer_type_from_token2(col_types[i], tokens[i]); + if ((id::enum_type == flt) && (id::enum_type != col_types[i])) + return true; + } + return false; +} + +/** + * Fill the input table only, given a DSV (delimiter-seperated values) + * file format, where delimiters are ',', ' ' or '\t'. + * + * This algorithm makes several passes over the data. First, it reads + * the entire table, as a collection of strings. Next, it tries to + * infer the column types, and the presence of a header. + */ +istream& istreamITable(istream& in, ITable& tab, + const vector& ignore_features) +{ + try { + istreamRawITable(in, tab); + } + catch (std::exception& e) { + istreamSparseITable(in, tab); + // Get rid of the unwanted columns. + tab.delete_columns(ignore_features); + return in; + } + + // Determine the column types. + type_node_seq col_types = infer_column_types(tab); + tab.set_types(col_types); + + // If there is a header row, then it must be the column labels. + if (has_header(tab, col_types)) { + tab.set_labels(tab.begin()->get_seq()); + tab.erase(tab.begin()); + } + + // Now that we have some column labels to work off of, + // Get rid of the unwanted columns. + tab.delete_columns(ignore_features); + + // Finally, perform a column type conversion + from_tokens_visitor ftv(tab.get_types()); + auto aft = apply_visitor(ftv); + OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), + [&](multi_type_seq& seq) { + return aft(seq.get_variant()); + }); + + return in; +} + +/** + * Like istreamITable but add the option to ignore indices. 
+ * + * It's akind of a temporary hack, till it's clear that this is much + * faster and we should recode istreamITable to ignore features + * head-on. + * + * Also, it assumes that the dataset is not sparse. + */ +istream& istreamITable_ignore_indices(istream& in, ITable& tab, + const vector& ignore_indices) +{ + istreamRawITable(in, tab, ignore_indices); + + // Determine the column types. + type_node_seq col_types = infer_column_types(tab); + tab.set_types(col_types); + + // If there is a header row, then it must be the column labels. + if (has_header(tab, col_types)) { + tab.set_labels(tab.begin()->get_seq()); + tab.erase(tab.begin()); + } + + // Finally, perform a column type conversion + from_tokens_visitor ftv(tab.get_types()); + auto aft = apply_visitor(ftv); + OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), + [&](multi_type_seq& seq) { + return aft(seq.get_variant()); + }); + + return in; +} + +OTable loadOTable(const string& file_name, const string& target_feature) +{ + vector ignore_features; + for (const string& l : get_header(file_name)) + if (l != target_feature) + ignore_features.push_back(l); + + ITable itab = loadITable(file_name, ignore_features); + OTable res(itab.get_column_data(target_feature), target_feature); + return res; +} + +/** + * Take a line and return a triple with vector containing the input + * elements, output element and timestamp. 
+ */ +std::tuple, string, string> +tokenizeRowIOT(const std::string& line, + const std::vector& ignored_indices, + int target_idx, // < 0 == ignored + int timestamp_idx) // < 0 == ignored +{ + std::tuple, string, string> res; + table_tokenizer toker = get_row_tokenizer(line); + int i = 0; + for (const std::string& tok : toker) { + if (!boost::binary_search(ignored_indices, i)) { + string el = boost::lexical_cast(tok); + if (target_idx == i) + std::get<1>(res) = el; + else if (timestamp_idx == i) + std::get<2>(res) = el; + else + std::get<0>(res).push_back(el); + } + i++; + } + return res; +} + +ITable loadITable(const string& file_name, + const vector& ignore_features) +{ + OC_ASSERT(!file_name.empty(), "the file name is empty"); + ifstream in(file_name.c_str()); + OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); + + ITable res; + istreamITable(in, res, ignore_features); + return res; +} + +/** + * Like loadITable but it is optimized by ignoring features head-on + * (rather than loading them, then removing them. + * + * WARNING: it assumes the dataset has a header!!! + */ +ITable loadITable_optimized(const string& file_name, + const vector& ignore_features) +{ + OC_ASSERT(!file_name.empty(), "the file name is empty"); + ifstream in(file_name.c_str()); + OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); + + // determined ignore_indices + vector ignore_indices = get_indices(ignore_features, + get_header(file_name)); + + ITable res; + istreamITable_ignore_indices(in, res, ignore_indices); + return res; +} + +/** + * Fill an input table and output table given a DSV + * (delimiter-seperated values) file format, where delimiters are ',', + * ' ' or '\t'. + * + * It is assumed that each row have the same number of columns, if not + * an assert is raised. + * + * pos specifies the position of the output, if -1 it is the last + * position. The default position is 0, the first column. 
+ * + * This is only used for sparse table and could be optimized + */ +istream& istreamTable_OLD(istream& in, Table& tab, + const string& target_feature, + const vector& ignore_features) +{ + istreamITable(in, tab.itable, ignore_features); + + tab.otable = tab.itable.get_column_data(target_feature); + OC_ASSERT(0 != tab.otable.size(), + "Fatal Error: target feature \"%s\" not found", + target_feature.c_str()); + + tab.target_pos = tab.itable.get_column_offset(target_feature); + + type_node targ_type = tab.itable.get_type(target_feature); + + string targ_feat = tab.itable.delete_column(target_feature); + + tab.otable.set_label(targ_feat); + tab.otable.set_type(targ_type); + + return in; +} + +/** + * Like istreamTable but optimize by ignoring features head-on rather + * than loading them then removing them. + * + * Warning: only works on dense data with header file. + */ +istream& istreamTable_ignore_indices(istream& in, Table& tab, + const string& target_feature, + const vector& ignore_indices) +{ + istreamITable_ignore_indices(in, tab.itable, ignore_indices); + + tab.otable = tab.itable.get_column_data(target_feature); + OC_ASSERT(0 != tab.otable.size(), + "Fatal Error: target feature \"%s\" not found", + target_feature.c_str()); + + tab.target_pos = tab.itable.get_column_offset(target_feature); + + type_node targ_type = tab.itable.get_type(target_feature); + + string targ_feat = tab.itable.delete_column(target_feature); + + tab.otable.set_label(targ_feat); + tab.otable.set_type(targ_type); + + return in; +} + +// ================================================================== + +static istream& +inferTableAttributes(istream& in, const string& target_feature, + const string& timestamp_feature, + const vector& ignore_features, + type_tree& tt, bool& has_header, bool& is_sparse) +{ + // maxline is the maximum number of lines to read to infer the + // attributes. A negative number means reading all lines. 
+ int maxline = 20; + streampos beg = in.tellg(); + + // Get a portion of the dataset into memory (cleaning weird stuff) + std::vector lines; + { + string line; + is_sparse = false; + while (get_data_line(in, line) && maxline-- > 0) { + // It is sparse + is_sparse = is_sparse || string::npos != line.find(sparse_delim); + if (is_sparse) { // just get out + // TODO could be simplified, optimized, etc + in.seekg(beg); + in.clear(); // in case it has reached the eof + return in; + } + + // put the line in a buffer + lines.push_back(line); + } + } + + // parse what could be a header + const vector maybe_header = tokenizeRow(lines.front()); + + // determine arity + arity_t arity = maybe_header.size(); + std::atomic arity_fail_row(-1); + + // determine initial type + type_node_seq types(arity, id::unknown_type); + + // parse the rest, determine its type and whether the arity is + // consistent + for (size_t i = 1; i < lines.size(); ++i) { + // Parse line + const string_seq& tokens = tokenizeRow(lines[i]); + + // Check arity + if (arity != (arity_t)tokens.size()) { + arity_fail_row = i + 1; + in.seekg(beg); + in.clear(); // in case it has reached the eof + OC_ASSERT(false, + "ERROR: Input file inconsistent: the %uth row has a " + "different number of columns than the rest of the file. 
" + "All rows should have the same number of columns.\n", + arity_fail_row.load()); + } + + // Infer type + boost::transform(types, tokens, types.begin(), + infer_type_from_token2); + } + + // Determine has_header + has_header = is_header(maybe_header, types); + + // Determine type signature + if (has_header) { + + // if unspecified, the target is the first column + unsigned target_idx = 0; + + // target feature will be ignored + if (!target_feature.empty()) { + auto target_it = std::find(maybe_header.begin(), maybe_header.end(), + target_feature); + OC_ASSERT(target_it != maybe_header.end(), "Target %s not found", + target_feature.c_str()); + target_idx = std::distance(maybe_header.begin(), target_it); + } + vector ignore_idxs = + get_indices(ignore_features, maybe_header); + ignore_idxs.push_back(target_idx); + boost::sort(ignore_idxs); + + // Include timestamp feature as idx to ignore + if (!timestamp_feature.empty()) { + auto timestamp_it = std::find(maybe_header.begin(), maybe_header.end(), + timestamp_feature); + OC_ASSERT(timestamp_it != maybe_header.end(), + "Timestamp feature %s not found", + timestamp_feature.c_str()); + unsigned timestamp_idx = std::distance(maybe_header.begin(), timestamp_it); + ignore_idxs.push_back(timestamp_idx); + boost::sort(ignore_idxs); + } + + // Generate type signature + type_node otype = types[target_idx]; + type_node_seq itypes; + for (unsigned i = 0; i < types.size(); ++i) + if (!boost::binary_search(ignore_idxs, i)) + itypes.push_back(types[i]); + tt = gen_signature(itypes, otype); + } else { + // No header, the target is the first column + type_node otype = types[0]; + types.erase(types.begin()); + tt = gen_signature(types, otype); + } + logger().debug() << "Infered type tree: " << tt; + + in.seekg(beg); + in.clear(); // in case it has reached the eof + return in; +} + +/** + * Perform 2 passes: + * + * 1) Infer + * 1.1) its type + * 1.2) whether it has a header + * 1.3) whether it is dense or sparse + * + * 2) Load the 
actual data. + */ +istream& istreamTable(istream& in, Table& tab, + const string& target_feature, + const string& timestamp_feature, + const vector& ignore_features) +{ + // Infer the properties of the table without loading its content + type_tree tt; + bool has_header, is_sparse; + streampos beg = in.tellg(); + inferTableAttributes(in, target_feature, timestamp_feature, + ignore_features, tt, has_header, is_sparse); + in.seekg(beg); + + if (is_sparse) { + // fallback on the old loader + // TODO: this could definitely be optimized + OC_ASSERT(timestamp_feature.empty(), "Timestamp feature not implemented"); + return istreamTable_OLD(in, tab, target_feature, ignore_features); + } else { + return istreamDenseTable(in, tab, target_feature, timestamp_feature, + ignore_features, tt, has_header); + } +} + +// ================================================================== + +/** + * Take a line and return a pair with vector containing the input + * elements and then output element. + */ +template +std::pair, T> +tokenizeRowIO( + const std::string& line, + const std::vector& ignored_indices=std::vector(), + unsigned target_idx=0) +{ + std::pair, T> res; + table_tokenizer toker = get_row_tokenizer(line); + size_t i = 0; + for (const std::string& tok : toker) { + if (!boost::binary_search(ignored_indices, i)) { + T el = boost::lexical_cast(tok); + if (target_idx == i) + res.second = el; + else + res.first.push_back(el); + } + i++; + } + return res; +} + +// ================================================================== + +static istream& +istreamDenseTable_noHeader(istream& in, Table& tab, + int target_idx, // < 0 == ignore + int timestamp_idx, // < 0 == ignore + const vector& ignore_idxs, + const type_tree& tt, bool has_header) +{ + // Get the entire dataset into memory (cleaning weird stuff) + string line; + std::vector lines; + while (get_data_line(in, line)) + lines.push_back(line); + + // Allocate all rows in the itable, otable and ttable + 
tab.itable.resize(lines.size()); + tab.otable.resize(lines.size()); + if (timestamp_idx >= 0) + tab.ttable.resize(lines.size()); + + // Get the elementary io types + type_node_seq itypes = + vector_comp(get_signature_inputs(tt), get_type_node); + type_node otype = get_type_node(get_signature_output(tt)); + + // Assign the io type to the table + tab.itable.set_types(itypes); + tab.otable.set_type(otype); + + // Instantiate type conversion for inputs + from_tokens_visitor ftv(itypes); + + // Function to parse each line (to be called in parallel) + auto parse_line = [&](unsigned i) { + try { + // Fill input + auto tokenIOT = tokenizeRowIOT(lines[i], ignore_idxs, + target_idx, timestamp_idx); + tab.itable[i] = ftv(std::get<0>(tokenIOT)); + + // Fill output + string output_str = std::get<1>(tokenIOT); + // If there is no valid target index, then there is no + // "output" column! + if ("" != output_str) + tab.otable[i] = token_to_vertex(otype, output_str); + + // Fill date + string date_str = std::get<2>(tokenIOT); + // If there is no valid timestamp index, then there is no + // "output" column! + if ("" != date_str) + tab.ttable[i] = TTable::from_string(date_str); + } + catch (AssertionException& ex) { + unsigned lineno = has_header? 
i+1 : i; + OC_ASSERT(false, "Parsing error occurred on line %d of input file\n" + "Exception: %s", lineno, ex.what()); + } + }; + + // Call it for each line in parallel + auto ir = boost::irange((size_t)0, lines.size()); + vector row_idxs(ir.begin(), ir.end()); + OMP_ALGO::for_each(row_idxs.begin(), row_idxs.end(), parse_line); + + // Assign the target position relative to the ignored indices + // (useful for writing that file back) + tab.target_pos = target_idx - boost::count_if(ignore_idxs, + arg1 < target_idx); + + if (timestamp_idx >= 0) + tab.timestamp_pos = timestamp_idx - + boost::count_if(ignore_idxs, arg1 < timestamp_idx); + + return in; +} + +istream& istreamDenseTable(istream& in, Table& tab, + const string& target_feature, + const string& timestamp_feature, + const vector& ignore_features, + const type_tree& tt, bool has_header) +{ + OC_ASSERT(has_header + || (target_feature.empty() + && ignore_features.empty() + && timestamp_feature.empty()), + "If the data file has no header, " + "then a target feature, ignore features or " + "timestamp_feature cannot be specified"); + + // determine target, timestamp and ignore indexes + int target_idx = 0; // if no header, target is at the first + // column by default + + int timestamp_idx = -1; // disabled by default + vector ignore_idxs; + if (has_header) { + string line; + get_data_line(in, line); + vector header = tokenizeRow(line); + + // Set target idx + if (!target_feature.empty()) { + auto target_it = std::find(header.begin(), header.end(), + target_feature); + OC_ASSERT(target_it != header.end(), "Target %s not found", + target_feature.c_str()); + target_idx = std::distance(header.begin(), target_it); + } + + // Set timestamp idx + if (!timestamp_feature.empty()) { + auto timestamp_it = std::find(header.begin(), header.end(), + timestamp_feature); + OC_ASSERT(timestamp_it != header.end(), "Timestamp feature %s not found", + timestamp_feature.c_str()); + timestamp_idx = std::distance(header.begin(), 
timestamp_it); + } + + // Set ignore idxs + ignore_idxs = get_indices(ignore_features, header); + + // get input and output labels from the header + auto iotlabels = tokenizeRowIOT(line, ignore_idxs, + target_idx, timestamp_idx); + tab.itable.set_labels(std::get<0>(iotlabels)); + tab.otable.set_label(std::get<1>(iotlabels)); + tab.ttable.set_label(std::get<2>(iotlabels)); + } + + return istreamDenseTable_noHeader(in, tab, target_idx, timestamp_idx, + ignore_idxs, tt, has_header); +} + +// ================================================================== + +// Parse a CompressedTable row +// TODO: implement timestamp support +CompressedTable::value_type parseCompressedTableRow(const type_tree& tt, const std::string& row_str) +{ + // split the string between input and output + unsigned end_outputs_pos = row_str.find("}"); + string outputs = row_str.substr(1, end_outputs_pos - 1), + inputs = row_str.substr(end_outputs_pos + 2); // +2 to go + // passed the + // following , + + // convert the inputs string into multi_type_seq + type_node_seq tns = vector_comp(get_signature_inputs(tt), get_type_node); + vector input_seq = tokenizeRow(inputs); + from_tokens_visitor ftv(tns); + multi_type_seq input_values = ftv(input_seq); + + // convert the outputs string into CompressedTable::counter_t + vector output_pair_seq = tokenizeRow(outputs); + CompressedTable::counter_t counter; + for (const string& pair_str : output_pair_seq) { + unsigned sep_pos = pair_str.find(":"); + string key_str = pair_str.substr(0, sep_pos), + value_str = pair_str.substr(sep_pos + 1); + vertex v = token_to_vertex(get_type_node(get_signature_output(tt)), + key_str); + count_t count = atof(value_str.c_str()); + counter[TimedValue(v)] = count; + } + return CompressedTable::value_type(input_values, counter); +} + +// WARNING: this implementation only supports boolean ctable!!!! 
+std::istream& istreamCompressedTable(std::istream& in, CompressedTable& ctable) +{ + //////////////// + // set header // + //////////////// + string header_line; + get_data_line(in, header_line); + auto labels = tokenizeRow(header_line); + ctable.set_labels(labels); + + //////////////////////// + // set type signature // + //////////////////////// + // HACK THIS PART TO MAKE IT SUPPORT OTHER TYPES THAN BOOLEAN + ctable.set_signature(gen_signature(id::boolean_type, ctable.get_arity())); + + ///////////////// + // set content // + ///////////////// + std::vector lines; + // read the entire file + { + string line; + while (get_data_line(in, line)) + lines.push_back(line); + } + // parse each line and fill the ctable + for (const string& line : lines) + ctable.insert(parseCompressedTableRow(ctable.get_signature(), line)); + + return in; +} + +Table loadTable(const std::string& file_name, + const std::string& target_feature, + const std::string& timestamp_feature, + const string_seq& ignore_features) +{ + OC_ASSERT(!file_name.empty(), "the file name is empty"); + ifstream in(file_name.c_str()); + OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); + + Table res; + istreamTable(in, res, target_feature, timestamp_feature, ignore_features); + return res; +} + +CompressedTable loadCompressedTable(const string& file_name) +{ + CompressedTable ctable; + OC_ASSERT(!file_name.empty(), "No filename specified!"); + ifstream in(file_name.c_str()); + istreamCompressedTable(in, ctable); + return ctable; +} + +// =========================================================== +// ostream regular tables + +void saveTable(const string& file_name, const Table& table) +{ + OC_ASSERT(!file_name.empty(), "No filename specified!"); + ofstream out(file_name.c_str()); + OC_ASSERT(out.is_open(), "Could not open %s", file_name.c_str()); + ostreamTable(out, table); +} + +// =========================================================== +// ostream CompressedTables + +ostream& 
ostreamCompressedTableHeader(ostream& out, const CompressedTable& ct) +{ + return ostreamln_container(out, ct.get_labels(), ","); +} + +ostream& ostreamCompressedTableRow(ostream& out, const CompressedTable::value_type& ctv) +{ + to_strings_visitor tsv; + auto ats = boost::apply_visitor(tsv); + // print map of outputs + out << "{"; + for(auto it = ctv.second.cbegin(); it != ctv.second.cend();) { + if (it->first.timestamp != boost::gregorian::date()) + out << "(" << table_fmt_vertex_to_str(it->first.value) + << "," << it->first.timestamp << "):" << it->second; + else + out << table_fmt_vertex_to_str(it->first.value) + << ":" << it->second; + if (++it != ctv.second.cend()) + out << ","; + } + out << "},"; + // print inputs + return ostreamln_container(out, ats(ctv.first.get_variant()), ","); +} + +ostream& ostreamCompressedTable(ostream& out, const CompressedTable& ct) +{ + // print header + ostreamCompressedTableHeader(out, ct); + // print data + for (const auto& v : ct) + ostreamCompressedTableRow(out, v); + + return out; +} + +ostream& ostreamCompressedTableTimeHeader(ostream& out, const CompressedTableTime& ctt) +{ + out << "timestamp,output" << endl; + return out; +} + +ostream& ostreamCompressedTableTimeRow(ostream& out, const CompressedTableTime::value_type& tio) +{ + out << tio.first << ",{"; + for (auto it = tio.second.cbegin(); it != tio.second.cend();) { + out << table_fmt_vertex_to_str(it->first) + << ":" << it->second; + if(++it != tio.second.cend()) + out << ","; + } + out << "}" << endl; + return out; +} + +ostream& ostreamCompressedTableTime(ostream& out, const CompressedTableTime& ctt) +{ + // print header + ostreamCompressedTableTimeHeader(out, ctt); + + // print data by time + for (const auto& tio : ctt) + ostreamCompressedTableTimeRow(out, tio); + + return out; +} + +// =========================================================== +// operator<< for the various tables and stuff. 
+ +ostream& operator<<(ostream& out, const ITable& it) +{ + ostreamln_container(out, it.get_labels(), ","); + ostreamln_container(out, it.get_types(), ","); + to_strings_visitor tsv; + for (const auto& row : it) { + vector row_str = boost::apply_visitor(tsv, row.get_variant()); + ostreamln_container(out, row_str, ","); + } + return out; +} + +ostream& operator<<(ostream& out, const OTable& ot) +{ + if (!ot.get_label().empty()) + out << ot.get_label() << endl; + out << ot.get_type() << endl; + for (const vertex& v : ot) + out << table_fmt_vertex_to_str(v) << endl; + return out; +} + +ostream& operator<<(ostream& out, const Table& table) +{ + return ostreamTable(out, table); +} + +ostream& operator<<(ostream& out, const complete_truth_table& tt) +{ + return ostream_container(out, tt); +} + +ostream& operator<<(ostream& out, const CompressedTable& ct) +{ + return ostreamCompressedTable(out, ct); +} + +} // ~namespaces combo + +std::string oc_to_string(const combo::ITable& it, const std::string& indent) +{ + std::stringstream ss; + ss << it; + return ss.str(); +} + +std::string oc_to_string(const combo::OTable& ot, const std::string& indent) +{ + std::stringstream ss; + ss << ot; + return ss.str(); +} + +std::string oc_to_string(const combo::Table& table, const std::string& indent) +{ + std::stringstream ss; + ss << table; + return ss.str(); +} + +std::string oc_to_string(const combo::CompressedTable& ct, const std::string& indent) +{ + std::stringstream ss; + ss << ct; + return ss.str(); +} + +std::string oc_to_string(const combo::complete_truth_table& tt, + const std::string& indent) +{ + std::stringstream ss; + ss << tt; + return ss.str(); +} + +} // ~namespaces opencog diff --git a/opencog/persist/csv/table_io.h b/opencog/persist/csv/table_io.h new file mode 100644 index 0000000000..43d95635cb --- /dev/null +++ b/opencog/persist/csv/table_io.h @@ -0,0 +1,265 @@ +/** + * table_io.h --- + * + * Copyright (C) 2010 OpenCog Foundation + * Copyright (C) 2012 Poulin 
Holdings LLC + * + * Authors: Nil Geisweiller + * Linas Vepstas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + + +#ifndef _OPENCOG_TABLE_IO_H +#define _OPENCOG_TABLE_IO_H + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "table.h" +#include "opencog/asmoses/combo/type_checker/type_tree.h" + +namespace opencog { namespace combo { + +/** + * remove the carriage return (for DOS format) + */ +void removeCarriageReturn(std::string& str); + +/** + * remove non ASCII char at the begining of the string + */ +void removeNonASCII(std::string& str); + +/** + * Return true if the next chars in 'in' correspond to carriage return + * (support UNIX and DOS format) and advance in of the checked chars. + */ +bool checkCarriageReturn(std::istream& in); + +/** + * Convert strings to typed values + */ +builtin token_to_boolean(const std::string& token); +contin_t token_to_contin(const std::string& token); +vertex token_to_vertex(const type_node &tipe, const std::string& token); + + +// =========================================================== + +typedef boost::tokenizer> table_tokenizer; + +/** + * Take a row, return a tokenizer. Tokenization uses the + * separator characters comma, blank, tab (',', ' ' or '\t'). 
+ */ +table_tokenizer get_row_tokenizer(const std::string& line); + +/** + * Take a line and return a vector containing the elements parsed. + * Used by istreamTable. + */ +template +static std::vector tokenizeRow( + const std::string& line, + const std::vector& ignored_indices=std::vector()) +{ + table_tokenizer tok = get_row_tokenizer(line); + std::vector res; + unsigned i = 0; + for (const std::string& t : tok) { + + // trim away whitespace padding; failing to do this + // confuses stuff downstream. + std::string clean(t); + boost::trim(clean); + + // Sometimes the tokenizer returns pure whitespace :-( + if (0 == clean.size()) continue; + + if (!boost::binary_search(ignored_indices, i++)) + res.push_back(boost::lexical_cast(clean)); + } + return res; +} + +// =========================================================== + +////////////////// +// istreamTable // +////////////////// + +// some hacky function to get the header of a DSV file (assuming there is one) +string_seq get_header(const std::string& input_file); + +std::istream& istreamRawITable( + std::istream& in, ITable& tab, + const std::vector& ignored_indices=std::vector()); + +std::istream& istreamITable(std::istream& in, ITable& tab, + const string_seq& ignore_features); + +std::istream& istreamTable(std::istream& in, Table& tab, + const std::string& target_feature, + const std::string& timestamp_feature, + const string_seq& ignore_features); + +// WARNING: this implementation only supports boolean ctable!!!! +std::istream& istreamCompressedTable(std::istream& in, CompressedTable& ctable); + +/** + * Load a OTable given the file name. Only works for dense DSV data. 
+ */ +OTable loadOTable(const std::string& file_name, + const std::string& target_feature); + +// TODO: reimplement loadITable with the same model of loadTable and +// remove loadITable_optimized +ITable loadITable( + const std::string& file_name, + const string_seq& ignore_features=string_seq()); + +ITable loadITable_optimized( + const std::string& file_name, + const string_seq& ignore_features=string_seq()); + +/** + * If target_feature is empty then, in case there is no header, it is + * assumed to be the first feature. + */ +Table loadTable( + const std::string& file_name, + const std::string& target_feature=std::string(), + const std::string& timestamp_feature=std::string(), + const string_seq& ignore_features=string_seq()); + +std::istream& istreamDenseTable(std::istream& in, Table& tab, + const std::string& target_feature, + const std::string& timestamp_feature, + const string_seq& ignore_features, + const type_tree& tt, bool has_header); + +// WARNING: this implementation only supports boolean ctable!!!! +CompressedTable loadCompressedTable(const std::string& file_name); + +////////////////// +// ostreamTable // +////////////////// + +/// output the header of a data table in CSV format. +template +Out& ostreamTableHeader(Out& out, const Table& table) +{ + // Add input features in header + string_seq header = table.itable.get_labels(); + unsigned hsize = header.size(); + + // Add target feature in header + const std::string& ol = table.otable.get_label(); + header.insert(header.begin() + std::min(table.target_pos, hsize), ol); + + // Add timestamp feature in header + if (!table.ttable.empty()) { + const std::string& tl = table.ttable.get_label(); + header.insert(header.begin() + table.timestamp_pos, tl); + } + + // Write the header + ostream_container(out, header, ",") << std::endl; + return out; +} + +/// Output a data table in CSV format. Boolean values are output in +/// binary form (0 for false, 1 for true). 
+template +Out& ostreamTable(Out& out, const Table& table) +{ + // print header + ostreamTableHeader(out, table); + + // print data + unsigned isize = table.itable.size(), osize = table.otable.size(); + OC_ASSERT(table.itable.empty() || isize == osize); + for (size_t row = 0; row < osize; ++row) { + // Add input values + string_seq content; + if (!table.itable.empty()) + content = table.itable[row].to_strings(); + unsigned csize = content.size(); + + // Add target feature value + std::string oc = table_fmt_vertex_to_str(table.otable[row]); + content.insert(content.begin() + std::min(table.target_pos, csize), oc); + + // Add timestamp feature value + if (!table.ttable.empty()) { + std::string tc = TTable::to_string(table.ttable[row]); + content.insert(content.begin() + table.timestamp_pos, tc); + } + + // Write content row + ostream_container(out, content, ",") << std::endl; + } + return out; +} + +/// like above but take a table instead of a input and output table +void saveTable(const std::string& file_name, const Table& table); + +/// output a compressed table in pseudo CSV format +std::ostream& ostreamCompressedTableRow(std::ostream& out, const CompressedTable::value_type& ctv); +std::ostream& ostreamCompressedTable(std::ostream& out, const CompressedTable& ct); + +/// Output a compressed table with each row corresponding to a +/// timestamp, chronologically ordered. 
+std::ostream& ostreamCompressedTableTime(std::ostream& out, const CompressedTableTime& ctt); + +std::ostream& operator<<(std::ostream& out, const ITable& it); + +std::ostream& operator<<(std::ostream& out, const OTable& ot); + +std::ostream& operator<<(std::ostream& out, const Table& table); + +std::ostream& operator<<(std::ostream& out, const CompressedTable& ct); + +std::ostream& operator<<(std::ostream& out, const complete_truth_table& tt); + +} // ~namespaces combo + +// For pretty printing OpenCog objects while debugging, see +// https://wiki.opencog.org/w/Development_standards#Pretty_Print_OpenCog_Objects +std::string oc_to_string(const combo::ITable& it, + const std::string& indent=empty_string); +std::string oc_to_string(const combo::OTable& ot, + const std::string& indent=empty_string); +std::string oc_to_string(const combo::Table& table, + const std::string& indent=empty_string); +std::string oc_to_string(const combo::CompressedTable& ct, + const std::string& indent=empty_string); +std::string oc_to_string(const combo::complete_truth_table& tt, + const std::string& indent=empty_string); + +} // ~namespaces opencog + +#endif // _OPENCOG_TABLE_IO_H From 0c75df43b9d3bc7c29af10e9dffae327f557c5b0 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Tue, 2 Aug 2022 11:23:37 +0200 Subject: [PATCH 04/56] Cut down the original code to only the readers --- opencog/persist/csv/table_io.h | 265 ---------------- .../csv/{table_io.cc => table_read.cc} | 295 ++---------------- opencog/persist/csv/table_read.h | 143 +++++++++ 3 files changed, 167 insertions(+), 536 deletions(-) delete mode 100644 opencog/persist/csv/table_io.h rename opencog/persist/csv/{table_io.cc => table_read.cc} (84%) create mode 100644 opencog/persist/csv/table_read.h diff --git a/opencog/persist/csv/table_io.h b/opencog/persist/csv/table_io.h deleted file mode 100644 index 43d95635cb..0000000000 --- a/opencog/persist/csv/table_io.h +++ /dev/null @@ -1,265 +0,0 @@ -/** - * table_io.h --- - * - * 
Copyright (C) 2010 OpenCog Foundation - * Copyright (C) 2012 Poulin Holdings LLC - * - * Authors: Nil Geisweiller - * Linas Vepstas - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License v3 as - * published by the Free Software Foundation and including the exceptions - * at http://opencog.org/wiki/Licenses - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program; if not, write to: - * Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - - -#ifndef _OPENCOG_TABLE_IO_H -#define _OPENCOG_TABLE_IO_H - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "table.h" -#include "opencog/asmoses/combo/type_checker/type_tree.h" - -namespace opencog { namespace combo { - -/** - * remove the carriage return (for DOS format) - */ -void removeCarriageReturn(std::string& str); - -/** - * remove non ASCII char at the begining of the string - */ -void removeNonASCII(std::string& str); - -/** - * Return true if the next chars in 'in' correspond to carriage return - * (support UNIX and DOS format) and advance in of the checked chars. - */ -bool checkCarriageReturn(std::istream& in); - -/** - * Convert strings to typed values - */ -builtin token_to_boolean(const std::string& token); -contin_t token_to_contin(const std::string& token); -vertex token_to_vertex(const type_node &tipe, const std::string& token); - - -// =========================================================== - -typedef boost::tokenizer> table_tokenizer; - -/** - * Take a row, return a tokenizer. 
Tokenization uses the - * separator characters comma, blank, tab (',', ' ' or '\t'). - */ -table_tokenizer get_row_tokenizer(const std::string& line); - -/** - * Take a line and return a vector containing the elements parsed. - * Used by istreamTable. - */ -template -static std::vector tokenizeRow( - const std::string& line, - const std::vector& ignored_indices=std::vector()) -{ - table_tokenizer tok = get_row_tokenizer(line); - std::vector res; - unsigned i = 0; - for (const std::string& t : tok) { - - // trim away whitespace padding; failing to do this - // confuses stuff downstream. - std::string clean(t); - boost::trim(clean); - - // Sometimes the tokenizer returns pure whitespace :-( - if (0 == clean.size()) continue; - - if (!boost::binary_search(ignored_indices, i++)) - res.push_back(boost::lexical_cast(clean)); - } - return res; -} - -// =========================================================== - -////////////////// -// istreamTable // -////////////////// - -// some hacky function to get the header of a DSV file (assuming there is one) -string_seq get_header(const std::string& input_file); - -std::istream& istreamRawITable( - std::istream& in, ITable& tab, - const std::vector& ignored_indices=std::vector()); - -std::istream& istreamITable(std::istream& in, ITable& tab, - const string_seq& ignore_features); - -std::istream& istreamTable(std::istream& in, Table& tab, - const std::string& target_feature, - const std::string& timestamp_feature, - const string_seq& ignore_features); - -// WARNING: this implementation only supports boolean ctable!!!! -std::istream& istreamCompressedTable(std::istream& in, CompressedTable& ctable); - -/** - * Load a OTable given the file name. Only works for dense DSV data. 
- */ -OTable loadOTable(const std::string& file_name, - const std::string& target_feature); - -// TODO: reimplement loadITable with the same model of loadTable and -// remove loadITable_optimized -ITable loadITable( - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - -ITable loadITable_optimized( - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - -/** - * If target_feature is empty then, in case there is no header, it is - * assumed to be the first feature. - */ -Table loadTable( - const std::string& file_name, - const std::string& target_feature=std::string(), - const std::string& timestamp_feature=std::string(), - const string_seq& ignore_features=string_seq()); - -std::istream& istreamDenseTable(std::istream& in, Table& tab, - const std::string& target_feature, - const std::string& timestamp_feature, - const string_seq& ignore_features, - const type_tree& tt, bool has_header); - -// WARNING: this implementation only supports boolean ctable!!!! -CompressedTable loadCompressedTable(const std::string& file_name); - -////////////////// -// ostreamTable // -////////////////// - -/// output the header of a data table in CSV format. -template -Out& ostreamTableHeader(Out& out, const Table& table) -{ - // Add input features in header - string_seq header = table.itable.get_labels(); - unsigned hsize = header.size(); - - // Add target feature in header - const std::string& ol = table.otable.get_label(); - header.insert(header.begin() + std::min(table.target_pos, hsize), ol); - - // Add timestamp feature in header - if (!table.ttable.empty()) { - const std::string& tl = table.ttable.get_label(); - header.insert(header.begin() + table.timestamp_pos, tl); - } - - // Write the header - ostream_container(out, header, ",") << std::endl; - return out; -} - -/// Output a data table in CSV format. Boolean values are output in -/// binary form (0 for false, 1 for true). 
-template -Out& ostreamTable(Out& out, const Table& table) -{ - // print header - ostreamTableHeader(out, table); - - // print data - unsigned isize = table.itable.size(), osize = table.otable.size(); - OC_ASSERT(table.itable.empty() || isize == osize); - for (size_t row = 0; row < osize; ++row) { - // Add input values - string_seq content; - if (!table.itable.empty()) - content = table.itable[row].to_strings(); - unsigned csize = content.size(); - - // Add target feature value - std::string oc = table_fmt_vertex_to_str(table.otable[row]); - content.insert(content.begin() + std::min(table.target_pos, csize), oc); - - // Add timestamp feature value - if (!table.ttable.empty()) { - std::string tc = TTable::to_string(table.ttable[row]); - content.insert(content.begin() + table.timestamp_pos, tc); - } - - // Write content row - ostream_container(out, content, ",") << std::endl; - } - return out; -} - -/// like above but take a table instead of a input and output table -void saveTable(const std::string& file_name, const Table& table); - -/// output a compressed table in pseudo CSV format -std::ostream& ostreamCompressedTableRow(std::ostream& out, const CompressedTable::value_type& ctv); -std::ostream& ostreamCompressedTable(std::ostream& out, const CompressedTable& ct); - -/// Output a compressed table with each row corresponding to a -/// timestamp, chronologically ordered. 
-std::ostream& ostreamCompressedTableTime(std::ostream& out, const CompressedTableTime& ctt); - -std::ostream& operator<<(std::ostream& out, const ITable& it); - -std::ostream& operator<<(std::ostream& out, const OTable& ot); - -std::ostream& operator<<(std::ostream& out, const Table& table); - -std::ostream& operator<<(std::ostream& out, const CompressedTable& ct); - -std::ostream& operator<<(std::ostream& out, const complete_truth_table& tt); - -} // ~namespaces combo - -// For pretty printing OpenCog objects while debugging, see -// https://wiki.opencog.org/w/Development_standards#Pretty_Print_OpenCog_Objects -std::string oc_to_string(const combo::ITable& it, - const std::string& indent=empty_string); -std::string oc_to_string(const combo::OTable& ot, - const std::string& indent=empty_string); -std::string oc_to_string(const combo::Table& table, - const std::string& indent=empty_string); -std::string oc_to_string(const combo::CompressedTable& ct, - const std::string& indent=empty_string); -std::string oc_to_string(const combo::complete_truth_table& tt, - const std::string& indent=empty_string); - -} // ~namespaces opencog - -#endif // _OPENCOG_TABLE_IO_H diff --git a/opencog/persist/csv/table_io.cc b/opencog/persist/csv/table_read.cc similarity index 84% rename from opencog/persist/csv/table_io.cc rename to opencog/persist/csv/table_read.cc index 1f80c8fdc6..2d2181449e 100644 --- a/opencog/persist/csv/table_io.cc +++ b/opencog/persist/csv/table_read.cc @@ -1,7 +1,8 @@ -/** table_io.cc --- +/** table_read.cc -- * * Copyright (C) 2010 OpenCog Foundation * Copyright (C) 2012 Poulin Holdings LLC + * Copyright (C) 2022 Linas Vepstas * * Authors: Nil Geisweiller * Linas Vepstas @@ -41,10 +42,9 @@ #include #include -#include "table.h" -#include "table_io.h" +#include "table_read.h" -namespace opencog { namespace combo { +namespace opencog { using namespace std; using namespace boost; @@ -125,7 +125,7 @@ static const char *sparse_delim = " : "; /** * parse a pair of 
key/value in a parse dataset, using ':' as * delimiter. For instance - * + * * parse_key_val("key : val") * * returns @@ -148,7 +148,7 @@ parse_key_val(string chunk) boost::trim(val); return {key, val}; } - + /** * Take a row, return a tokenizer. Tokenization uses the * separator characters comma, blank, tab (',', ' ' or '\t'). @@ -231,7 +231,7 @@ type_node infer_type_from_token(const string& token) * Compare this to 'curr_guess', and upgrade the type inference * if it can be done consistently. */ -static type_node +static type_node infer_type_from_token2(type_node curr_guess, const string& token) { type_node tokt = infer_type_from_token(token); @@ -508,21 +508,21 @@ struct from_sparse_tokens_visitor : public from_tokens_visitor * The sparse table format consists of some fixed number of columns, * in comma-separated format, followed by key-value pairs, also * tab-separated. viz: - * + * * val, val, val, name:val, name:val, name:val - * - * Thus, for example, a row such as - * + * + * Thus, for example, a row such as + * * earn, issued : 1, results : 2, ending : 1, including : 1 - * + * * indicates that there one fixed column, of enum type, (the enum value - * being "earn"), and that features called "issued", "ending" and + * being "earn"), and that features called "issued", "ending" and * "including" have a contin value of 1.0 and "results" has a contin * value of 2. - * + * * The routine does NOT store the table in sparse format: it stores the * full, exploded table. This could be bad ... - * TODO: we really need a sparse table format, as well. + * TODO: we really need a sparse table format, as well. * * The "Raw" format has all data as strings; type conversion to the * appropriate type, must all be done as a separate step. @@ -556,7 +556,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) if (0 == fixed_arity) { vector fixy = tokenizeSparseRow(lines[0]); // count commas, until a semi-colon is found. 
- while (string::npos == fixy[fixed_arity].find(sparse_delim)) + while (string::npos == fixy[fixed_arity].find(sparse_delim)) fixed_arity++; } logger().info() << "Sparse file fixed column count=" << fixed_arity; @@ -762,20 +762,8 @@ istream& istreamITable_ignore_indices(istream& in, ITable& tab, [&](multi_type_seq& seq) { return aft(seq.get_variant()); }); - - return in; -} - -OTable loadOTable(const string& file_name, const string& target_feature) -{ - vector ignore_features; - for (const string& l : get_header(file_name)) - if (l != target_feature) - ignore_features.push_back(l); - ITable itab = loadITable(file_name, ignore_features); - OTable res(itab.get_column_data(target_feature), target_feature); - return res; + return in; } /** @@ -834,7 +822,7 @@ ITable loadITable_optimized(const string& file_name, // determined ignore_indices vector ignore_indices = get_indices(ignore_features, get_header(file_name)); - + ITable res; istreamITable_ignore_indices(in, res, ignore_indices); return res; @@ -860,12 +848,12 @@ istream& istreamTable_OLD(istream& in, Table& tab, istreamITable(in, tab.itable, ignore_features); tab.otable = tab.itable.get_column_data(target_feature); - OC_ASSERT(0 != tab.otable.size(), + OC_ASSERT(0 != tab.otable.size(), "Fatal Error: target feature \"%s\" not found", target_feature.c_str()); tab.target_pos = tab.itable.get_column_offset(target_feature); - + type_node targ_type = tab.itable.get_type(target_feature); string targ_feat = tab.itable.delete_column(target_feature); @@ -885,16 +873,16 @@ istream& istreamTable_OLD(istream& in, Table& tab, istream& istreamTable_ignore_indices(istream& in, Table& tab, const string& target_feature, const vector& ignore_indices) -{ +{ istreamITable_ignore_indices(in, tab.itable, ignore_indices); tab.otable = tab.itable.get_column_data(target_feature); - OC_ASSERT(0 != tab.otable.size(), + OC_ASSERT(0 != tab.otable.size(), "Fatal Error: target feature \"%s\" not found", target_feature.c_str()); tab.target_pos = 
tab.itable.get_column_offset(target_feature); - + type_node targ_type = tab.itable.get_type(target_feature); string targ_feat = tab.itable.delete_column(target_feature); @@ -1067,7 +1055,7 @@ istream& istreamTable(istream& in, Table& tab, */ template std::pair, T> -tokenizeRowIO( +tokenizeRowIO ( const std::string& line, const std::vector& ignored_indices=std::vector(), unsigned target_idx=0) @@ -1227,72 +1215,6 @@ istream& istreamDenseTable(istream& in, Table& tab, // ================================================================== -// Parse a CompressedTable row -// TODO: implement timestamp support -CompressedTable::value_type parseCompressedTableRow(const type_tree& tt, const std::string& row_str) -{ - // split the string between input and output - unsigned end_outputs_pos = row_str.find("}"); - string outputs = row_str.substr(1, end_outputs_pos - 1), - inputs = row_str.substr(end_outputs_pos + 2); // +2 to go - // passed the - // following , - - // convert the inputs string into multi_type_seq - type_node_seq tns = vector_comp(get_signature_inputs(tt), get_type_node); - vector input_seq = tokenizeRow(inputs); - from_tokens_visitor ftv(tns); - multi_type_seq input_values = ftv(input_seq); - - // convert the outputs string into CompressedTable::counter_t - vector output_pair_seq = tokenizeRow(outputs); - CompressedTable::counter_t counter; - for (const string& pair_str : output_pair_seq) { - unsigned sep_pos = pair_str.find(":"); - string key_str = pair_str.substr(0, sep_pos), - value_str = pair_str.substr(sep_pos + 1); - vertex v = token_to_vertex(get_type_node(get_signature_output(tt)), - key_str); - count_t count = atof(value_str.c_str()); - counter[TimedValue(v)] = count; - } - return CompressedTable::value_type(input_values, counter); -} - -// WARNING: this implementation only supports boolean ctable!!!! 
-std::istream& istreamCompressedTable(std::istream& in, CompressedTable& ctable) -{ - //////////////// - // set header // - //////////////// - string header_line; - get_data_line(in, header_line); - auto labels = tokenizeRow(header_line); - ctable.set_labels(labels); - - //////////////////////// - // set type signature // - //////////////////////// - // HACK THIS PART TO MAKE IT SUPPORT OTHER TYPES THAN BOOLEAN - ctable.set_signature(gen_signature(id::boolean_type, ctable.get_arity())); - - ///////////////// - // set content // - ///////////////// - std::vector lines; - // read the entire file - { - string line; - while (get_data_line(in, line)) - lines.push_back(line); - } - // parse each line and fill the ctable - for (const string& line : lines) - ctable.insert(parseCompressedTableRow(ctable.get_signature(), line)); - - return in; -} - Table loadTable(const std::string& file_name, const std::string& target_feature, const std::string& timestamp_feature, @@ -1307,173 +1229,4 @@ Table loadTable(const std::string& file_name, return res; } -CompressedTable loadCompressedTable(const string& file_name) -{ - CompressedTable ctable; - OC_ASSERT(!file_name.empty(), "No filename specified!"); - ifstream in(file_name.c_str()); - istreamCompressedTable(in, ctable); - return ctable; -} - -// =========================================================== -// ostream regular tables - -void saveTable(const string& file_name, const Table& table) -{ - OC_ASSERT(!file_name.empty(), "No filename specified!"); - ofstream out(file_name.c_str()); - OC_ASSERT(out.is_open(), "Could not open %s", file_name.c_str()); - ostreamTable(out, table); -} - -// =========================================================== -// ostream CompressedTables - -ostream& ostreamCompressedTableHeader(ostream& out, const CompressedTable& ct) -{ - return ostreamln_container(out, ct.get_labels(), ","); -} - -ostream& ostreamCompressedTableRow(ostream& out, const CompressedTable::value_type& ctv) -{ - 
to_strings_visitor tsv; - auto ats = boost::apply_visitor(tsv); - // print map of outputs - out << "{"; - for(auto it = ctv.second.cbegin(); it != ctv.second.cend();) { - if (it->first.timestamp != boost::gregorian::date()) - out << "(" << table_fmt_vertex_to_str(it->first.value) - << "," << it->first.timestamp << "):" << it->second; - else - out << table_fmt_vertex_to_str(it->first.value) - << ":" << it->second; - if (++it != ctv.second.cend()) - out << ","; - } - out << "},"; - // print inputs - return ostreamln_container(out, ats(ctv.first.get_variant()), ","); -} - -ostream& ostreamCompressedTable(ostream& out, const CompressedTable& ct) -{ - // print header - ostreamCompressedTableHeader(out, ct); - // print data - for (const auto& v : ct) - ostreamCompressedTableRow(out, v); - - return out; -} - -ostream& ostreamCompressedTableTimeHeader(ostream& out, const CompressedTableTime& ctt) -{ - out << "timestamp,output" << endl; - return out; -} - -ostream& ostreamCompressedTableTimeRow(ostream& out, const CompressedTableTime::value_type& tio) -{ - out << tio.first << ",{"; - for (auto it = tio.second.cbegin(); it != tio.second.cend();) { - out << table_fmt_vertex_to_str(it->first) - << ":" << it->second; - if(++it != tio.second.cend()) - out << ","; - } - out << "}" << endl; - return out; -} - -ostream& ostreamCompressedTableTime(ostream& out, const CompressedTableTime& ctt) -{ - // print header - ostreamCompressedTableTimeHeader(out, ctt); - - // print data by time - for (const auto& tio : ctt) - ostreamCompressedTableTimeRow(out, tio); - - return out; -} - -// =========================================================== -// operator<< for the various tables and stuff. 
- -ostream& operator<<(ostream& out, const ITable& it) -{ - ostreamln_container(out, it.get_labels(), ","); - ostreamln_container(out, it.get_types(), ","); - to_strings_visitor tsv; - for (const auto& row : it) { - vector row_str = boost::apply_visitor(tsv, row.get_variant()); - ostreamln_container(out, row_str, ","); - } - return out; -} - -ostream& operator<<(ostream& out, const OTable& ot) -{ - if (!ot.get_label().empty()) - out << ot.get_label() << endl; - out << ot.get_type() << endl; - for (const vertex& v : ot) - out << table_fmt_vertex_to_str(v) << endl; - return out; -} - -ostream& operator<<(ostream& out, const Table& table) -{ - return ostreamTable(out, table); -} - -ostream& operator<<(ostream& out, const complete_truth_table& tt) -{ - return ostream_container(out, tt); -} - -ostream& operator<<(ostream& out, const CompressedTable& ct) -{ - return ostreamCompressedTable(out, ct); -} - -} // ~namespaces combo - -std::string oc_to_string(const combo::ITable& it, const std::string& indent) -{ - std::stringstream ss; - ss << it; - return ss.str(); -} - -std::string oc_to_string(const combo::OTable& ot, const std::string& indent) -{ - std::stringstream ss; - ss << ot; - return ss.str(); -} - -std::string oc_to_string(const combo::Table& table, const std::string& indent) -{ - std::stringstream ss; - ss << table; - return ss.str(); -} - -std::string oc_to_string(const combo::CompressedTable& ct, const std::string& indent) -{ - std::stringstream ss; - ss << ct; - return ss.str(); -} - -std::string oc_to_string(const combo::complete_truth_table& tt, - const std::string& indent) -{ - std::stringstream ss; - ss << tt; - return ss.str(); -} - } // ~namespaces opencog diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h new file mode 100644 index 0000000000..b5ac544a50 --- /dev/null +++ b/opencog/persist/csv/table_read.h @@ -0,0 +1,143 @@ +/** + * table_read.h -- Read a CSV/TSV table + * + * Copyright (C) 2010 OpenCog Foundation + * 
Copyright (C) 2012 Poulin Holdings LLC + * Copyright (C) 2022 Linas Vepstas + * + * Authors: Nil Geisweiller + * Linas Vepstas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _ATOMESE_TABLE_READ_H +#define _ATOMESE_TABLE_READ_H + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace opencog { + +/** + * remove the carriage return (for DOS format) + */ +void removeCarriageReturn(std::string& str); + +/** + * remove non ASCII char at the begining of the string + */ +void removeNonASCII(std::string& str); + +/** + * Return true if the next chars in 'in' correspond to carriage return + * (support UNIX and DOS format) and advance in of the checked chars. + */ +bool checkCarriageReturn(std::istream& in); + +/** + * Convert strings to typed values + */ +builtin token_to_boolean(const std::string& token); +contin_t token_to_contin(const std::string& token); +vertex token_to_vertex(const type_node &tipe, const std::string& token); + + +// =========================================================== + +typedef boost::tokenizer> table_tokenizer; + +/** + * Take a row, return a tokenizer. Tokenization uses the + * separator characters comma, blank, tab (',', ' ' or '\t'). 
+ */ +table_tokenizer get_row_tokenizer(const std::string& line); + +/** + * Take a line and return a vector containing the elements parsed. + */ +template +static std::vector tokenizeRow ( + const std::string& line, + const std::vector& ignored_indices=std::vector()) +{ + table_tokenizer tok = get_row_tokenizer(line); + std::vector res; + unsigned i = 0; + for (const std::string& t : tok) { + + // trim away whitespace padding; failing to do this + // confuses stuff downstream. + std::string clean(t); + boost::trim(clean); + + // Sometimes the tokenizer returns pure whitespace :-( + if (0 == clean.size()) continue; + + if (!boost::binary_search(ignored_indices, i++)) + res.push_back(boost::lexical_cast(clean)); + } + return res; +} + +// =========================================================== + +// Get the header of a DSV file (assuming there is one) +string_seq get_header(const std::string& input_file); + +std::istream& istreamRawITable( + std::istream& in, ITable& tab, + const std::vector& ignored_indices=std::vector()); + +std::istream& istreamITable(std::istream& in, ITable& tab, + const string_seq& ignore_features); + +std::istream& istreamTable(std::istream& in, Table& tab, + const string_seq& ignore_features); + +// TODO: reimplement loadITable with the same model of loadTable and +// remove loadITable_optimized +ITable loadITable( + const std::string& file_name, + const string_seq& ignore_features=string_seq()); + +ITable loadITable_optimized( + const std::string& file_name, + const string_seq& ignore_features=string_seq()); + +/** + * If target_feature is empty then, in case there is no header, it is + * assumed to be the first feature. 
+ */ +Table loadTable( + const std::string& file_name, + const string_seq& ignore_features=string_seq()); + +std::istream& istreamDenseTable(std::istream& in, Table& tab, + const string_seq& ignore_features, + const type_tree& tt, bool has_header); + + +} // ~namespaces opencog + +#endif // _ATOMESE_TABLE_READ_H From 345de2bb2ac58ffc735ebd2230bf2a6603c144b1 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 12:25:28 +0300 Subject: [PATCH 05/56] Add Makefile. --- opencog/persist/CMakeLists.txt | 1 + opencog/persist/csv/CMakeLists.txt | 25 +++++++++++++++++++++++++ opencog/persist/csv/load_csv.cc | 4 ++-- 3 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 opencog/persist/csv/CMakeLists.txt diff --git a/opencog/persist/CMakeLists.txt b/opencog/persist/CMakeLists.txt index 9fd966d7bb..ea24c420ad 100644 --- a/opencog/persist/CMakeLists.txt +++ b/opencog/persist/CMakeLists.txt @@ -1,5 +1,6 @@ ADD_SUBDIRECTORY (storage) ADD_SUBDIRECTORY (api) +ADD_SUBDIRECTORY (csv) IF (HAVE_GEARMAN AND HAVE_GUILE) ADD_SUBDIRECTORY (gearman) diff --git a/opencog/persist/csv/CMakeLists.txt b/opencog/persist/csv/CMakeLists.txt new file mode 100644 index 0000000000..d358336b58 --- /dev/null +++ b/opencog/persist/csv/CMakeLists.txt @@ -0,0 +1,25 @@ + +# Generic JSON decoding. 
+ADD_LIBRARY (csv + load_csv.cc + table_read.cc +) + +ADD_DEPENDENCIES(csv opencog_atom_types) + +TARGET_LINK_LIBRARIES(csv + atomspace + atombase + ${COGUTIL_LIBRARY} +) + +INSTALL (TARGETS csv EXPORT AtomSpaceTargets + DESTINATION "lib${LIB_DIR_SUFFIX}/opencog" +) + +INSTALL (FILES + load_csv.h + DESTINATION "include/opencog/persist/csv" +) + +# ------------------------------- diff --git a/opencog/persist/csv/load_csv.cc b/opencog/persist/csv/load_csv.cc index f69885c879..56405b3348 100644 --- a/opencog/persist/csv/load_csv.cc +++ b/opencog/persist/csv/load_csv.cc @@ -1,4 +1,4 @@ -/** +/** * load_csv.cc -- Load CSV tables into Values * * Copyright (C) 2022 Linas Vepstas @@ -25,7 +25,7 @@ using namespace opencog; * Load columns from a CSV file and place them into Atomese Values on * the indicated Atom. Atomese Values are vectors (of floats, bools, * srings, or more complex structures). Each Value holds one column - * from the dataset. + * from the dataset. * * The features (columns) specified in ignore_features will be omitted * from the representation. From ca52b37e4e2453af9722aa8a6b5878728dc5f768 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 12:36:28 +0300 Subject: [PATCH 06/56] Include AtomSpace --- opencog/persist/csv/load_csv.cc | 5 +++++ opencog/persist/csv/load_csv.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/opencog/persist/csv/load_csv.cc b/opencog/persist/csv/load_csv.cc index 56405b3348..6c55f5f000 100644 --- a/opencog/persist/csv/load_csv.cc +++ b/opencog/persist/csv/load_csv.cc @@ -19,6 +19,11 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ +#include + +#include +#include "load_csv.h" + using namespace opencog; /** diff --git a/opencog/persist/csv/load_csv.h b/opencog/persist/csv/load_csv.h index a28a0b56ae..f073d6336b 100644 --- a/opencog/persist/csv/load_csv.h +++ b/opencog/persist/csv/load_csv.h @@ -25,6 +25,8 @@ #ifndef _ATOMESE_LOAD_CSV_H #define _ATOMESE_LOAD_CSV_H +#include + namespace opencog { // Load columns from a CSV file and place them into Atomese Values on From 7a3e3cf3937ed7c2573d7bf2cc9a3a1b792f3cbd Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 12:59:06 +0300 Subject: [PATCH 07/56] Convert bool and contin types to Values --- opencog/persist/csv/table_read.cc | 96 ++++++++++++++----------------- opencog/persist/csv/table_read.h | 8 ++- 2 files changed, 48 insertions(+), 56 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 2d2181449e..1006e20b60 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -42,6 +42,10 @@ #include #include +#include +#include +#include + #include "table_read.h" namespace opencog { @@ -259,64 +263,50 @@ infer_type_from_token2(type_node curr_guess, const string& token) } /// cast string "token" to a vertex of type "tipe" -builtin token_to_boolean(const string& token) +ValuePtr token_to_boolean(const string& token) { - if ("0" == token || "F" == token || "f" == token) - return id::logical_false; - else if ("1" == token || "T" == token || "t" == token) - return id::logical_true; - else { - OC_ASSERT(false, "Expecting boolean value, got %s", token.c_str()); - return builtin(); - } + if ("0" == token || "F" == token || "f" == token) + return createBoolValue(false); + + if ("1" == token || "T" == token || "t" == token) + return createBoolValue(true); + + throw RuntimeError(TRACE_INFO, + "Expecting boolean value, got %s", token.c_str()); } -contin_t token_to_contin(const string& token) + +ValuePtr token_to_contin(const string& token) { - try { - return 
lexical_cast(token); - } catch(boost::bad_lexical_cast&) { - OC_ASSERT(false, "Could not cast %s to contin", token.c_str()); - return contin_t(); - } + try { + return createFloatValue(lexical_cast(token)); + } catch (boost::bad_lexical_cast&) { + throw RuntimeError(TRACE_INFO, + "Could not cast %s to floating point", token.c_str()); + } } -vertex token_to_vertex(const type_node &tipe, const string& token) -{ - switch (tipe) { - - case id::boolean_type: - return token_to_boolean(token); - - case id::contin_type: - return token_to_contin(token); - - case id::enum_type: - // Enum types must begin with an alpha character - if (isalpha(token[0])) - return enum_t(token); - OC_ASSERT(false, "Enum type must begin with alphabetic char, but %s doesn't", token.c_str()); - break; - - case id::definite_object_type: - return token; - break; - - // Ugly hack ... the problem adressed here is that feature - // selection has to read and propagate columns of unknown type - // (typically, dates, times). So we hack around this here. 
- case id::ill_formed_type: - return enum_t(token); - // return id::ill_formed_type; - // return id::null_vertex; - break; - - default: - stringstream ss; - ss << "Unable to convert token \"" << token << "\" to type=" << tipe << endl; - OC_ASSERT(0, ss.str().c_str()); - } - // unreachable - return id::null_vertex; +ValuePtr token_to_vertex(Type tipe, const std::string& token) +{ + if (BOOL_VALUE == tipe) + return token_to_boolean(token); + + if (FLOAT_VALUE == tipe) + return token_to_contin(token); + + if (STRING_VALUE == tipe) + { + // Enum types must begin with an alpha character + if (isalpha(token[0])) + return createStringValue(token); + + throw RuntimeError(TRACE_INFO, + "Enum type must begin with alphabetic char, but %s doesn't", + token.c_str()); + } + + stringstream ss; + ss << "Unable to convert token \"" << token << "\" to type=" << tipe << endl; + throw RuntimeError(TRACE_INFO, "%s", ss.str().c_str()); } // =========================================================== diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index b5ac544a50..0030997c86 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -37,6 +37,8 @@ #include #include +#include + namespace opencog { /** @@ -58,9 +60,9 @@ bool checkCarriageReturn(std::istream& in); /** * Convert strings to typed values */ -builtin token_to_boolean(const std::string& token); -contin_t token_to_contin(const std::string& token); -vertex token_to_vertex(const type_node &tipe, const std::string& token); +ValuePtr token_to_boolean(const std::string&); +ValuePtr token_to_contin(const std::string&); +ValuePtr token_to_vertex(Type, const std::string&); // =========================================================== From c1e78242dfbe503440c58b9c06811999948b72e9 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 13:05:21 +0300 Subject: [PATCH 08/56] Define what string_seq is --- opencog/persist/csv/table_read.cc | 13 ++++++------- 
opencog/persist/csv/table_read.h | 1 + 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 1006e20b60..7d9c8418a7 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -50,7 +50,6 @@ namespace opencog { -using namespace std; using namespace boost; using namespace boost::phoenix; using boost::phoenix::arg_names::arg1; @@ -105,9 +104,9 @@ bool is_comment(const char c) // // The signature of this routine is the same as std:getline() // -istream &get_data_line(istream& is, string& line) +std::istream& get_data_line(std::istream& is, std::string& line) { - while (1) + while (true) { getline(is, line); if (!is) return is; @@ -382,12 +381,12 @@ istream& istreamRawITable(istream& in, ITable& tab, return in; } -vector get_header(const string& file_name) +std::vector get_header(const std::string& file_name) { - ifstream in(file_name.c_str()); - string line; + std::ifstream in(file_name.c_str()); + std::string line; get_data_line(in, line); - return tokenizeRow(line); + return tokenizeRow(line); } // =========================================================== diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 0030997c86..46c5f72c7b 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -103,6 +103,7 @@ static std::vector tokenizeRow ( } // =========================================================== +typedef std::vector string_seq; // Get the header of a DSV file (assuming there is one) string_seq get_header(const std::string& input_file); From 1e311f7272e620887f2c8d296d771d363697c975 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 13:33:30 +0300 Subject: [PATCH 09/56] std namespace conversion for strings --- opencog/persist/csv/table_read.cc | 94 +++++++++++++++---------------- opencog/persist/csv/table_read.h | 21 +++++-- 2 files changed, 62 insertions(+), 53 
deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 7d9c8418a7..ea0d621e92 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -56,7 +56,7 @@ using boost::phoenix::arg_names::arg1; // ------------------------------------------------------- -bool checkCarriageReturn(istream& in) +bool checkCarriageReturn(std::istream& in) { char next_c = in.get(); if (next_c == '\r') // DOS format @@ -66,7 +66,7 @@ bool checkCarriageReturn(istream& in) return false; } -void removeCarriageReturn(string& str) +void removeCarriageReturn(std::string& str) { size_t s = str.size(); if ((s > 0) && (str[s-1] == '\r')) @@ -74,7 +74,7 @@ void removeCarriageReturn(string& str) } //* Remove non-ascii characters at the bigining of the line, only. -void removeNonASCII(string& str) +void removeNonASCII(std::string& str) { while (str.size() && (unsigned char)str[0] > 127) str = str.substr(1); @@ -138,16 +138,16 @@ static const char *sparse_delim = " : "; * If no such delimiter is found then it return a pair with empty key * and empty val. */ -static pair -parse_key_val(string chunk) +static std::pair +parse_key_val(const std::string& chunk) { - pair res; + std::pair res; size_t pos = chunk.find(sparse_delim); - if (string::npos == pos) + if (std::string::npos == pos) return res; - string key = chunk.substr(0, pos); + std::string key = chunk.substr(0, pos); boost::trim(key); - string val = chunk.substr(pos + strlen(sparse_delim)); + std::string val = chunk.substr(pos + strlen(sparse_delim)); boost::trim(val); return {key, val}; } @@ -167,7 +167,7 @@ table_tokenizer get_row_tokenizer(const std::string& line) } // Same as above, but only allow commas as a column separator. 
-table_tokenizer get_sparse_row_tokenizer(const string& line) +table_tokenizer get_sparse_row_tokenizer(const std::string& line) { typedef boost::escaped_list_separator separator; typedef boost::tokenizer tokenizer; @@ -182,7 +182,7 @@ table_tokenizer get_sparse_row_tokenizer(const string& line) * Used by istreamTable. This will modify the line to remove leading * non-ASCII characters, as well as stripping of any carriage-returns. */ -vector tokenizeSparseRow(const string& line) +vector tokenizeSparseRow(const std::string& line) { table_tokenizer tok = get_sparse_row_tokenizer(line); vector res; @@ -198,7 +198,7 @@ vector tokenizeSparseRow(const string& line) * Given an input string, guess the type of the string. * Inferable types are: boolean, contin and enum. */ -type_node infer_type_from_token(const string& token) +type_node infer_type_from_token(const std::string& token) { /* Prefered representation is T's and 0's, to maximize clarity, * readability. Numeric values are easily confused with contin @@ -235,7 +235,7 @@ type_node infer_type_from_token(const string& token) * if it can be done consistently. 
*/ static type_node -infer_type_from_token2(type_node curr_guess, const string& token) +infer_type_from_token2(type_node curr_guess, const std::string& token) { type_node tokt = infer_type_from_token(token); @@ -262,7 +262,7 @@ infer_type_from_token2(type_node curr_guess, const string& token) } /// cast string "token" to a vertex of type "tipe" -ValuePtr token_to_boolean(const string& token) +ValuePtr token_to_boolean(const std::string& token) { if ("0" == token || "F" == token || "f" == token) return createBoolValue(false); @@ -274,7 +274,7 @@ ValuePtr token_to_boolean(const string& token) "Expecting boolean value, got %s", token.c_str()); } -ValuePtr token_to_contin(const string& token) +ValuePtr token_to_contin(const std::string& token) { try { return createFloatValue(lexical_cast(token)); @@ -325,7 +325,7 @@ istream& istreamRawITable(istream& in, ITable& tab, streampos beg = in.tellg(); // Get the entire dataset into memory - string line; + std::string line; std::vector lines; // Read first few by hand. The first might be labels, so we must @@ -525,7 +525,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) // ... unless it isn't. (The header must not contain a colon). vector labs; size_t fixed_arity = 0; - string header; + std::string header; get_data_line(in, header); if (string::npos == header.find(sparse_delim)) { // Determine the arity of the fixed columns @@ -538,7 +538,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) } // Get the entire dataset into memory - string iline; + std::string iline; while (get_data_line(in, iline)) lines.push_back(iline); @@ -558,7 +558,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) // Fixed features may have different types, by column. 
type_node_seq types(fixed_arity, id::unknown_type); - for (const string& line : lines) { + for (const std::string& line : lines) { vector chunks = tokenizeSparseRow(line); vector::const_iterator pit = chunks.begin(); @@ -570,7 +570,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) for (; pit != chunks.end(); ++pit) { // Rip out the key-value pairs auto key_val = parse_key_val(*pit); - if (key_val == pair()) + if (key_val == pair()) break; // Store the key, uniquely. Store best guess as the type. feats.insert(key_val.first); @@ -584,8 +584,8 @@ istream& istreamSparseITable(istream& in, ITable& tab) // Convert the feature set into a list of labels. // 'index' is a map from feature name to column number. size_t cnt = fixed_arity; - map index; - for (const string& key : feats) { + std::map index; + for (const std::string& key : feats) { types.push_back(feat_type); labs.push_back(key); index[key] = cnt; @@ -598,7 +598,7 @@ istream& istreamSparseITable(istream& in, ITable& tab) from_sparse_tokens_visitor fstv(types, index, fixed_arity); auto fill_line = [&](int i) { - const string& line = lines[i]; + const std::string& line = lines[i]; // Tokenize the line vector chunks = tokenizeSparseRow(line); multi_type_seq row = fstv(chunks); @@ -759,18 +759,18 @@ istream& istreamITable_ignore_indices(istream& in, ITable& tab, * Take a line and return a triple with vector containing the input * elements, output element and timestamp. 
*/ -std::tuple, string, string> +std::tuple, std::string, std::string> tokenizeRowIOT(const std::string& line, const std::vector& ignored_indices, int target_idx, // < 0 == ignored int timestamp_idx) // < 0 == ignored { - std::tuple, string, string> res; + std::tuple, std::string, std::string> res; table_tokenizer toker = get_row_tokenizer(line); int i = 0; for (const std::string& tok : toker) { if (!boost::binary_search(ignored_indices, i)) { - string el = boost::lexical_cast(tok); + std::string el = boost::lexical_cast(tok); if (target_idx == i) std::get<1>(res) = el; else if (timestamp_idx == i) @@ -783,7 +783,7 @@ tokenizeRowIOT(const std::string& line, return res; } -ITable loadITable(const string& file_name, +ITable loadITable(const std::string& file_name, const vector& ignore_features) { OC_ASSERT(!file_name.empty(), "the file name is empty"); @@ -801,7 +801,7 @@ ITable loadITable(const string& file_name, * * WARNING: it assumes the dataset has a header!!! */ -ITable loadITable_optimized(const string& file_name, +ITable loadITable_optimized(const std::string& file_name, const vector& ignore_features) { OC_ASSERT(!file_name.empty(), "the file name is empty"); @@ -831,8 +831,8 @@ ITable loadITable_optimized(const string& file_name, * This is only used for sparse table and could be optimized */ istream& istreamTable_OLD(istream& in, Table& tab, - const string& target_feature, - const vector& ignore_features) + const std::string& target_feature, + const std::vector& ignore_features) { istreamITable(in, tab.itable, ignore_features); @@ -845,7 +845,7 @@ istream& istreamTable_OLD(istream& in, Table& tab, type_node targ_type = tab.itable.get_type(target_feature); - string targ_feat = tab.itable.delete_column(target_feature); + std::string targ_feat = tab.itable.delete_column(target_feature); tab.otable.set_label(targ_feat); tab.otable.set_type(targ_type); @@ -860,8 +860,8 @@ istream& istreamTable_OLD(istream& in, Table& tab, * Warning: only works on dense data with 
header file. */ istream& istreamTable_ignore_indices(istream& in, Table& tab, - const string& target_feature, - const vector& ignore_indices) + const std::string& target_feature, + const std::vector& ignore_indices) { istreamITable_ignore_indices(in, tab.itable, ignore_indices); @@ -874,7 +874,7 @@ istream& istreamTable_ignore_indices(istream& in, Table& tab, type_node targ_type = tab.itable.get_type(target_feature); - string targ_feat = tab.itable.delete_column(target_feature); + std::string targ_feat = tab.itable.delete_column(target_feature); tab.otable.set_label(targ_feat); tab.otable.set_type(targ_type); @@ -885,8 +885,8 @@ istream& istreamTable_ignore_indices(istream& in, Table& tab, // ================================================================== static istream& -inferTableAttributes(istream& in, const string& target_feature, - const string& timestamp_feature, +inferTableAttributes(istream& in, const std::string& target_feature, + const std::string& timestamp_feature, const vector& ignore_features, type_tree& tt, bool& has_header, bool& is_sparse) { @@ -898,11 +898,11 @@ inferTableAttributes(istream& in, const string& target_feature, // Get a portion of the dataset into memory (cleaning weird stuff) std::vector lines; { - string line; + std::string line; is_sparse = false; while (get_data_line(in, line) && maxline-- > 0) { // It is sparse - is_sparse = is_sparse || string::npos != line.find(sparse_delim); + is_sparse = is_sparse || std::string::npos != line.find(sparse_delim); if (is_sparse) { // just get out // TODO could be simplified, optimized, etc in.seekg(beg); @@ -1013,9 +1013,9 @@ inferTableAttributes(istream& in, const string& target_feature, * 2) Load the actual data. 
*/ istream& istreamTable(istream& in, Table& tab, - const string& target_feature, - const string& timestamp_feature, - const vector& ignore_features) + const std::string& target_feature, + const std::string& timestamp_feature, + const std::vector& ignore_features) { // Infer the properties of the table without loading its content type_tree tt; @@ -1075,7 +1075,7 @@ istreamDenseTable_noHeader(istream& in, Table& tab, const type_tree& tt, bool has_header) { // Get the entire dataset into memory (cleaning weird stuff) - string line; + std::string line; std::vector lines; while (get_data_line(in, line)) lines.push_back(line); @@ -1107,14 +1107,14 @@ istreamDenseTable_noHeader(istream& in, Table& tab, tab.itable[i] = ftv(std::get<0>(tokenIOT)); // Fill output - string output_str = std::get<1>(tokenIOT); + std::string output_str = std::get<1>(tokenIOT); // If there is no valid target index, then there is no // "output" column! if ("" != output_str) tab.otable[i] = token_to_vertex(otype, output_str); // Fill date - string date_str = std::get<2>(tokenIOT); + std::string date_str = std::get<2>(tokenIOT); // If there is no valid timestamp index, then there is no // "output" column! 
if ("" != date_str) @@ -1145,8 +1145,8 @@ istreamDenseTable_noHeader(istream& in, Table& tab, } istream& istreamDenseTable(istream& in, Table& tab, - const string& target_feature, - const string& timestamp_feature, + const std::string& target_feature, + const std::string& timestamp_feature, const vector& ignore_features, const type_tree& tt, bool has_header) { @@ -1165,7 +1165,7 @@ istream& istreamDenseTable(istream& in, Table& tab, int timestamp_idx = -1; // disabled by default vector ignore_idxs; if (has_header) { - string line; + std::string line; get_data_line(in, line); vector header = tokenizeRow(line); diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 46c5f72c7b..bf428324e7 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -102,6 +102,15 @@ static std::vector tokenizeRow ( return res; } +// =========================================================== + +// TODO Should this be a TableValue? +class Table : public std::vector +{ + public: + Table(void); +}; + // =========================================================== typedef std::vector string_seq; @@ -109,10 +118,10 @@ typedef std::vector string_seq; string_seq get_header(const std::string& input_file); std::istream& istreamRawITable( - std::istream& in, ITable& tab, + std::istream& in, Table& tab, const std::vector& ignored_indices=std::vector()); -std::istream& istreamITable(std::istream& in, ITable& tab, +std::istream& istreamITable(std::istream& in, Table& tab, const string_seq& ignore_features); std::istream& istreamTable(std::istream& in, Table& tab, @@ -120,11 +129,11 @@ std::istream& istreamTable(std::istream& in, Table& tab, // TODO: reimplement loadITable with the same model of loadTable and // remove loadITable_optimized -ITable loadITable( +Table loadITable( const std::string& file_name, const string_seq& ignore_features=string_seq()); -ITable loadITable_optimized( +Table loadITable_optimized( const std::string& 
file_name, const string_seq& ignore_features=string_seq()); @@ -136,9 +145,9 @@ Table loadTable( const std::string& file_name, const string_seq& ignore_features=string_seq()); -std::istream& istreamDenseTable(std::istream& in, Table& tab, +std::istream& istreamDenseTable(std::istream&, Table&, const string_seq& ignore_features, - const type_tree& tt, bool has_header); + const std::vector&, bool has_header); } // ~namespaces opencog From 15a338e2545517e0ed946b812dcd4a1451c25ce9 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 13:45:26 +0300 Subject: [PATCH 10/56] More std namespace and atomese conversions --- opencog/persist/csv/table_read.cc | 148 +++++++++++++++--------------- 1 file changed, 75 insertions(+), 73 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index ea0d621e92..9f0f36c1e0 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -45,10 +46,11 @@ #include #include #include +#include #include "table_read.h" -namespace opencog { +using namespace opencog; using namespace boost; using namespace boost::phoenix; @@ -58,26 +60,26 @@ using boost::phoenix::arg_names::arg1; bool checkCarriageReturn(std::istream& in) { - char next_c = in.get(); - if (next_c == '\r') // DOS format - next_c = in.get(); - if (next_c == '\n') - return true; - return false; + char next_c = in.get(); + if (next_c == '\r') // DOS format + next_c = in.get(); + if (next_c == '\n') + return true; + return false; } void removeCarriageReturn(std::string& str) { - size_t s = str.size(); - if ((s > 0) && (str[s-1] == '\r')) - str.resize(s-1); + size_t s = str.size(); + if ((s > 0) && (str[s-1] == '\r')) + str.resize(s-1); } //* Remove non-ascii characters at the bigining of the line, only. 
void removeNonASCII(std::string& str) { - while (str.size() && (unsigned char)str[0] > 127) - str = str.substr(1); + while (str.size() && (unsigned char)str[0] > 127) + str = str.substr(1); } // ------------------------------------------------------- @@ -86,13 +88,13 @@ void removeNonASCII(std::string& str) // of hash, bang or semicolon. bool is_comment(const char c) { - if ('#' == c) return true; - if (';' == c) return true; - if ('!' == c) return true; - if ('\n' == c) return true; - if ('\r' == c) return true; - if (0 == c) return true; - return false; + if ('#' == c) return true; + if (';' == c) return true; + if ('!' == c) return true; + if ('\n' == c) return true; + if ('\r' == c) return true; + if (0 == c) return true; + return false; } /// Get one line of actual data. @@ -106,19 +108,19 @@ bool is_comment(const char c) // std::istream& get_data_line(std::istream& is, std::string& line) { - while (true) - { - getline(is, line); - if (!is) return is; - if (is_comment(line[0])) continue; + while (true) + { + getline(is, line); + if (!is) return is; + if (is_comment(line[0])) continue; - // Remove weird symbols at the start of the line (only). - removeNonASCII(line); - // Remove carriage return at end of line (for DOS files). - removeCarriageReturn(line); + // Remove weird symbols at the start of the line (only). + removeNonASCII(line); + // Remove carriage return at end of line (for DOS files). 
+ removeCarriageReturn(line); - return is; - } + return is; + } } // ------------------------------------------------------- @@ -141,15 +143,15 @@ static const char *sparse_delim = " : "; static std::pair parse_key_val(const std::string& chunk) { - std::pair res; - size_t pos = chunk.find(sparse_delim); - if (std::string::npos == pos) - return res; - std::string key = chunk.substr(0, pos); - boost::trim(key); - std::string val = chunk.substr(pos + strlen(sparse_delim)); - boost::trim(val); - return {key, val}; + std::pair res; + size_t pos = chunk.find(sparse_delim); + if (std::string::npos == pos) + return res; + std::string key = chunk.substr(0, pos); + boost::trim(key); + std::string val = chunk.substr(pos + strlen(sparse_delim)); + boost::trim(val); + return {key, val}; } /** @@ -158,23 +160,23 @@ parse_key_val(const std::string& chunk) */ table_tokenizer get_row_tokenizer(const std::string& line) { - typedef boost::escaped_list_separator separator; - typedef boost::tokenizer tokenizer; + typedef boost::escaped_list_separator separator; + typedef boost::tokenizer tokenizer; - // Tokenize line; currently, we allow tabs, commas, blanks. - static const separator sep("\\", ",\t ", "\""); - return tokenizer(line, sep); + // Tokenize line; currently, we allow tabs, commas, blanks. + static const separator sep("\\", ",\t ", "\""); + return tokenizer(line, sep); } // Same as above, but only allow commas as a column separator. table_tokenizer get_sparse_row_tokenizer(const std::string& line) { - typedef boost::escaped_list_separator separator; - typedef boost::tokenizer tokenizer; + typedef boost::escaped_list_separator separator; + typedef boost::tokenizer tokenizer; - // Tokenize line; currently, we allow tabs, commas, blanks. - static const separator sep("\\", ",", "\""); - return tokenizer(line, sep); + // Tokenize line; currently, we allow tabs, commas, blanks. 
+ static const separator sep("\\", ",", "\""); + return tokenizer(line, sep); } /** @@ -182,15 +184,15 @@ table_tokenizer get_sparse_row_tokenizer(const std::string& line) * Used by istreamTable. This will modify the line to remove leading * non-ASCII characters, as well as stripping of any carriage-returns. */ -vector tokenizeSparseRow(const std::string& line) +std::vector tokenizeSparseRow(const std::string& line) { - table_tokenizer tok = get_sparse_row_tokenizer(line); - vector res; - for (string t : tok) { - boost::trim(t); - res.push_back(t); - } - return res; + table_tokenizer tok = get_sparse_row_tokenizer(line); + std::vector res; + for (std::string t : tok) { + boost::trim(t); + res.push_back(t); + } + return res; } // ------------------------------------------------------- @@ -198,11 +200,11 @@ vector tokenizeSparseRow(const std::string& line) * Given an input string, guess the type of the string. * Inferable types are: boolean, contin and enum. */ -type_node infer_type_from_token(const std::string& token) +Type infer_type_from_token(const std::string& token) { /* Prefered representation is T's and 0's, to maximize clarity, - * readability. Numeric values are easily confused with contin - * type. + * readability. Numeric values are easily confused with floating + * point type. */ if (token == "0" || token == "1" || @@ -210,20 +212,20 @@ type_node infer_type_from_token(const std::string& token) token == "F" || token == "t" || token == "f") - return id::boolean_type; + return BOOL_VALUE; // If it starts with an alphabetic character, assume its a string else if (isalpha(token[0])) - return id::enum_type; + return STRING_VALUE; // Hope that we can cast this to a float point number. else { try { - lexical_cast(token); - return id::contin_type; + boost::lexical_cast(token); + return FLOAT_VALUE; } catch(...) 
{ - return id::ill_formed_type; + return VOID_VALUE; } } } @@ -234,13 +236,13 @@ type_node infer_type_from_token(const std::string& token) * Compare this to 'curr_guess', and upgrade the type inference * if it can be done consistently. */ -static type_node -infer_type_from_token2(type_node curr_guess, const std::string& token) +static Type +infer_type_from_token2(Type curr_guess, const std::string& token) { - type_node tokt = infer_type_from_token(token); + Type tokt = infer_type_from_token(token); // First time, just go with the flow. - if (id::unknown_type == curr_guess) + if (VOID_VALUE == curr_guess) return tokt; // Yayy! its consistent! @@ -248,17 +250,17 @@ infer_type_from_token2(type_node curr_guess, const std::string& token) return tokt; // If we saw 0,1 when expecting a contin, its a contin. - if ((id::contin_type == curr_guess) && (id::boolean_type == tokt)) + if ((FLOAT_VALUE == curr_guess) && (BOOL_VALUE == tokt)) return curr_guess; // If we thought its a boolean 0,1 it might be a contin. - if ((id::boolean_type == curr_guess) && (id::contin_type == tokt)) + if ((BOOL_VALUE == curr_guess) && (FLOAT_VALUE == tokt)) return tokt; // If we got to here, then there's some sort of unexpected // inconsistency in the column types; we've got to presume that // its just some crazy ascii string, i.e. enum_type. 
- return id::enum_type; + return STRING_VALUE; } /// cast string "token" to a vertex of type "tipe" @@ -1218,4 +1220,4 @@ Table loadTable(const std::string& file_name, return res; } -} // ~namespaces opencog +// ================================================================== From 4c5aac801ab997339e841a2b98914bd82cbdc56e Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 13:55:42 +0300 Subject: [PATCH 11/56] More namespace conversions --- opencog/persist/csv/table_read.cc | 157 ++++++++++++++++-------------- opencog/persist/csv/table_read.h | 19 ---- 2 files changed, 83 insertions(+), 93 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 9f0f36c1e0..08fd9435cd 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -58,7 +58,11 @@ using boost::phoenix::arg_names::arg1; // ------------------------------------------------------- -bool checkCarriageReturn(std::istream& in) +/** + * Return true if the next chars in 'in' correspond to carriage return + * (support UNIX and DOS format) and advance in of the checked chars. + */ +static bool checkCarriageReturn(std::istream& in) { char next_c = in.get(); if (next_c == '\r') // DOS format @@ -68,15 +72,20 @@ bool checkCarriageReturn(std::istream& in) return false; } -void removeCarriageReturn(std::string& str) +/** + * remove the carriage return (for DOS format) + */ +static void removeCarriageReturn(std::string& str) { size_t s = str.size(); if ((s > 0) && (str[s-1] == '\r')) str.resize(s-1); } -//* Remove non-ascii characters at the bigining of the line, only. -void removeNonASCII(std::string& str) +/** + * remove non ASCII char at the begining of the string + */ +static void removeNonASCII(std::string& str) { while (str.size() && (unsigned char)str[0] > 127) str = str.substr(1); @@ -86,7 +95,7 @@ void removeNonASCII(std::string& str) // Return true if the character is one of the standard comment // delimiters. 
Here, we define a 'standard delimiter' as one // of hash, bang or semicolon. -bool is_comment(const char c) +static bool is_comment(const char c) { if ('#' == c) return true; if (';' == c) return true; @@ -264,7 +273,7 @@ infer_type_from_token2(Type curr_guess, const std::string& token) } /// cast string "token" to a vertex of type "tipe" -ValuePtr token_to_boolean(const std::string& token) +static ValuePtr token_to_boolean(const std::string& token) { if ("0" == token || "F" == token || "f" == token) return createBoolValue(false); @@ -272,21 +281,21 @@ ValuePtr token_to_boolean(const std::string& token) if ("1" == token || "T" == token || "t" == token) return createBoolValue(true); - throw RuntimeError(TRACE_INFO, + throw SyntaxException(TRACE_INFO, "Expecting boolean value, got %s", token.c_str()); } -ValuePtr token_to_contin(const std::string& token) +static ValuePtr token_to_contin(const std::string& token) { try { return createFloatValue(lexical_cast(token)); } catch (boost::bad_lexical_cast&) { - throw RuntimeError(TRACE_INFO, + throw SyntaxException(TRACE_INFO, "Could not cast %s to floating point", token.c_str()); } } -ValuePtr token_to_vertex(Type tipe, const std::string& token) +ValuePtr opencog::token_to_vertex(Type tipe, const std::string& token) { if (BOOL_VALUE == tipe) return token_to_boolean(token); @@ -300,14 +309,14 @@ ValuePtr token_to_vertex(Type tipe, const std::string& token) if (isalpha(token[0])) return createStringValue(token); - throw RuntimeError(TRACE_INFO, + throw SyntaxException(TRACE_INFO, "Enum type must begin with alphabetic char, but %s doesn't", token.c_str()); } - stringstream ss; - ss << "Unable to convert token \"" << token << "\" to type=" << tipe << endl; - throw RuntimeError(TRACE_INFO, "%s", ss.str().c_str()); + throw SyntaxException(TRACE_INFO, + "Unable to convert token \"%s\" to type=%d", + token.c_str(), tipe); } // =========================================================== @@ -321,74 +330,74 @@ ValuePtr 
token_to_vertex(Type tipe, const std::string& token) * the appropriate type, and thunking for the header, and ignoring * certain features, must all be done as a separate step. */ -istream& istreamRawITable(istream& in, ITable& tab, - const vector& ignored_indices) +std::istream& istreamRawITable(std::istream& in, Table& tab, + const std::vector& ignored_indices) { - streampos beg = in.tellg(); - - // Get the entire dataset into memory - std::string line; - std::vector lines; - - // Read first few by hand. The first might be labels, so we must - // get at least the second line. But the second line might have - // all default feature values (i.e. no colon), so get the third... - dorepeat(20) { - if (!get_data_line(in, line)) - break; - // If it is a sparse file, we are outta here. - // Throw an std::exception, since we don't want to log this as an - // error (all the other exception types log to the log file). - if (string::npos != line.find (sparse_delim)) { - in.seekg(beg); - throw std::exception(); - } - lines.push_back(line); - } - - // Grab the rest of the file. - while (get_data_line(in, line)) - lines.push_back(line); + std::streampos beg = in.tellg(); + + // Get the entire dataset into memory + std::string line; + std::vector lines; + + // Read first few by hand. The first might be labels, so we must + // get at least the second line. But the second line might have + // all default feature values (i.e. no colon), so get the third... + dorepeat(20) { + if (!get_data_line(in, line)) + break; + // If it is a sparse file, we are outta here. + // Throw an std::exception, since we don't want to log this as an + // error (all the other exception types log to the log file). + if (string::npos != line.find (sparse_delim)) { + in.seekg(beg); + throw std::exception(); + } + lines.push_back(line); + } - // Determine the arity from the first line. - vector fl = tokenizeRow(lines[0], ignored_indices); - arity_t arity = fl.size(); + // Grab the rest of the file. 
+ while (get_data_line(in, line)) + lines.push_back(line); - std::atomic arity_fail_row(-1); - auto parse_line = [&](size_t i) - { - // tokenize the line and fill the table with - tab[i] = tokenizeRow(lines[i], ignored_indices); + // Determine the arity from the first line. + vector fl = tokenizeRow(lines[0], ignored_indices); + arity_t arity = fl.size(); - // Check arity - if (arity != (arity_t)tab[i].size()) - arity_fail_row = i + 1; - }; - - // Vector of indices [0, lines.size()) - size_t ls = lines.size(); - tab.resize(ls); - auto ir = boost::irange((size_t)0, ls); - vector indices(ir.begin(), ir.end()); - OMP_ALGO::for_each(indices.begin(), indices.end(), parse_line); - - if (-1 != arity_fail_row) { - in.seekg(beg); - OC_ASSERT(false, - "ERROR: Input file inconsistent: the %uth row has " - "a different number of columns than the rest of the file. " - "All rows should have the same number of columns.\n", - arity_fail_row.load()); - } - return in; + std::atomic arity_fail_row(-1); + auto parse_line = [&](size_t i) + { + // tokenize the line and fill the table with + tab[i] = tokenizeRow(lines[i], ignored_indices); + + // Check arity + if (arity != (arity_t)tab[i].size()) + arity_fail_row = i + 1; + }; + + // Vector of indices [0, lines.size()) + size_t ls = lines.size(); + tab.resize(ls); + auto ir = boost::irange((size_t)0, ls); + vector indices(ir.begin(), ir.end()); + OMP_ALGO::for_each(indices.begin(), indices.end(), parse_line); + + if (-1 != arity_fail_row) { + in.seekg(beg); + OC_ASSERT(false, + "ERROR: Input file inconsistent: the %uth row has " + "a different number of columns than the rest of the file. 
" + "All rows should have the same number of columns.\n", + arity_fail_row.load()); + } + return in; } std::vector get_header(const std::string& file_name) { - std::ifstream in(file_name.c_str()); - std::string line; - get_data_line(in, line); - return tokenizeRow(line); + std::ifstream in(file_name.c_str()); + std::string line; + get_data_line(in, line); + return tokenizeRow(line); } // =========================================================== diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index bf428324e7..4011876588 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -41,30 +41,11 @@ namespace opencog { -/** - * remove the carriage return (for DOS format) - */ -void removeCarriageReturn(std::string& str); - -/** - * remove non ASCII char at the begining of the string - */ -void removeNonASCII(std::string& str); - -/** - * Return true if the next chars in 'in' correspond to carriage return - * (support UNIX and DOS format) and advance in of the checked chars. 
- */ -bool checkCarriageReturn(std::istream& in); - /** * Convert strings to typed values */ -ValuePtr token_to_boolean(const std::string&); -ValuePtr token_to_contin(const std::string&); ValuePtr token_to_vertex(Type, const std::string&); - // =========================================================== typedef boost::tokenizer> table_tokenizer; From a8e17056453e25f20f5373b05293f62a0194f480 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 14:10:47 +0300 Subject: [PATCH 12/56] More conversions --- opencog/persist/csv/table_read.cc | 23 +++++++++++++---------- opencog/persist/csv/table_read.h | 2 +- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 08fd9435cd..5ea9d1236e 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -330,8 +330,8 @@ ValuePtr opencog::token_to_vertex(Type tipe, const std::string& token) * the appropriate type, and thunking for the header, and ignoring * certain features, must all be done as a separate step. */ -std::istream& istreamRawITable(std::istream& in, Table& tab, - const std::vector& ignored_indices) +std::istream& istreamRawITable(std::istream& in, std::vector& tab, + const std::vector& ignored_indices) { std::streampos beg = in.tellg(); @@ -342,13 +342,16 @@ std::istream& istreamRawITable(std::istream& in, Table& tab, // Read first few by hand. The first might be labels, so we must // get at least the second line. But the second line might have // all default feature values (i.e. no colon), so get the third... - dorepeat(20) { + dorepeat(20) + { if (!get_data_line(in, line)) break; + // If it is a sparse file, we are outta here. // Throw an std::exception, since we don't want to log this as an // error (all the other exception types log to the log file). 
- if (string::npos != line.find (sparse_delim)) { + if (std::string::npos != line.find (sparse_delim)) + { in.seekg(beg); throw std::exception(); } @@ -360,17 +363,17 @@ std::istream& istreamRawITable(std::istream& in, Table& tab, lines.push_back(line); // Determine the arity from the first line. - vector fl = tokenizeRow(lines[0], ignored_indices); - arity_t arity = fl.size(); + std::vector fl = tokenizeRow(lines[0], ignored_indices); + size_t arity = fl.size(); std::atomic arity_fail_row(-1); auto parse_line = [&](size_t i) { // tokenize the line and fill the table with - tab[i] = tokenizeRow(lines[i], ignored_indices); + tab[i] = tokenizeRow(lines[i], ignored_indices); // Check arity - if (arity != (arity_t)tab[i].size()) + if (arity != tab[i].size()) arity_fail_row = i + 1; }; @@ -378,12 +381,12 @@ std::istream& istreamRawITable(std::istream& in, Table& tab, size_t ls = lines.size(); tab.resize(ls); auto ir = boost::irange((size_t)0, ls); - vector indices(ir.begin(), ir.end()); + std::vector indices(ir.begin(), ir.end()); OMP_ALGO::for_each(indices.begin(), indices.end(), parse_line); if (-1 != arity_fail_row) { in.seekg(beg); - OC_ASSERT(false, + throw SyntaxException(TRACE_INFO, "ERROR: Input file inconsistent: the %uth row has " "a different number of columns than the rest of the file. 
" "All rows should have the same number of columns.\n", diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 4011876588..25b3aa5251 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -99,7 +99,7 @@ typedef std::vector string_seq; string_seq get_header(const std::string& input_file); std::istream& istreamRawITable( - std::istream& in, Table& tab, + std::istream& in, std::vector& table, const std::vector& ignored_indices=std::vector()); std::istream& istreamITable(std::istream& in, Table& tab, From cf8743bf7400653a8e3256330a8207224454211e Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 14:15:10 +0300 Subject: [PATCH 13/56] White-space conversion --- opencog/persist/csv/table_read.cc | 168 +++++++++++++++--------------- 1 file changed, 85 insertions(+), 83 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 5ea9d1236e..3c8ddf1bd2 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -409,37 +409,39 @@ std::vector get_header(const std::string& file_name) * into a multi_type_seq containing the typed values given the input * type signature. 
*/ -struct from_tokens_visitor : public boost::static_visitor +struct from_tokens_visitor : public boost::static_visitor { - from_tokens_visitor(const type_node_seq& types) : _types(types) { - all_boolean = boost::count(types, id::boolean_type) == (int)types.size(); - all_contin = boost::count(types, id::contin_type) == (int)types.size(); - } - result_type operator()(const string_seq& seq) { - result_type res; - if (all_boolean) { - res = builtin_seq(); - builtin_seq& bs = res.get_seq(); - boost::transform(seq, back_inserter(bs), token_to_boolean); - } - else if (all_contin) { - res = contin_seq(); - contin_seq& cs = res.get_seq(); - boost::transform(seq, back_inserter(cs), token_to_contin); - } - else { - res = vertex_seq(); - vertex_seq& vs = res.get_seq(); - boost::transform(_types, seq, back_inserter(vs), token_to_vertex); - } - return res; - } - template result_type operator()(const Seq& seq) { - OC_ASSERT(false, "You are not supposed to do that"); - return result_type(); - } - const type_node_seq& _types; - bool all_boolean, all_contin; + from_tokens_visitor(const std::vector& types) : _types(types) + { + all_boolean = boost::count(types, BOOL_VALUE) == (int)types.size(); + all_contin = boost::count(types, FLOAT_VALUE) == (int)types.size(); + } + result_type operator()(const string_seq& seq) + { + result_type res; + if (all_boolean) { + res = builtin_seq(); + builtin_seq& bs = res.get_seq(); + boost::transform(seq, back_inserter(bs), token_to_boolean); + } + else if (all_contin) { + res = contin_seq(); + contin_seq& cs = res.get_seq(); + boost::transform(seq, back_inserter(cs), token_to_contin); + } + else { + res = vertex_seq(); + vertex_seq& vs = res.get_seq(); + boost::transform(_types, seq, back_inserter(vs), token_to_vertex); + } + return res; + } + template result_type operator()(const Seq& seq) { + OC_ASSERT(false, "You are not supposed to do that"); + return result_type(); + } + const type_node_seq& _types; + bool all_boolean, all_contin; }; @@ 
-448,59 +450,59 @@ struct from_tokens_visitor : public boost::static_visitor */ struct from_sparse_tokens_visitor : public from_tokens_visitor { - from_sparse_tokens_visitor(const type_node_seq& types, - const std::map& index, - size_t fixed_arity) - : from_tokens_visitor(types), _index(index), _fixed_arity(fixed_arity) {} - result_type operator()(const string_seq& seq) { - using std::transform; - using std::for_each; - result_type res; - if (all_boolean) { - res = builtin_seq(_types.size(), id::logical_false); - builtin_seq& bs = res.get_seq(); - auto begin_sparse = seq.begin() + _fixed_arity; - transform(seq.begin(), begin_sparse, bs.begin(), token_to_boolean); - for (auto it = begin_sparse; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - bs[idx] = token_to_boolean(key_val.second); - } - } - } - else if (all_contin) { - res = contin_seq(_types.size(), 0.0); - contin_seq& cs = res.get_seq(); - auto begin_sparse = seq.cbegin() + _fixed_arity; - transform(seq.begin(), begin_sparse, cs.begin(), token_to_contin); - for (auto it = begin_sparse; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - cs[idx] = token_to_contin(key_val.second); - } - } - } - else { - res = vertex_seq(_types.size()); - vertex_seq& vs = res.get_seq(); - auto begin_sparse_types = _types.cbegin() + _fixed_arity; - auto begin_sparse_seq = seq.cbegin() + _fixed_arity; - transform(_types.begin(), begin_sparse_types, - seq.begin(), vs.begin(), token_to_vertex); - for (auto it = begin_sparse_seq; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - vs[idx] = token_to_vertex(_types[idx], key_val.second); - } - } - } - return res; - } - std::map _index; - size_t _fixed_arity; + from_sparse_tokens_visitor(const type_node_seq& types, + const std::map& 
index, + size_t fixed_arity) + : from_tokens_visitor(types), _index(index), _fixed_arity(fixed_arity) {} + result_type operator()(const string_seq& seq) { + using std::transform; + using std::for_each; + result_type res; + if (all_boolean) { + res = builtin_seq(_types.size(), id::logical_false); + builtin_seq& bs = res.get_seq(); + auto begin_sparse = seq.begin() + _fixed_arity; + transform(seq.begin(), begin_sparse, bs.begin(), token_to_boolean); + for (auto it = begin_sparse; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + bs[idx] = token_to_boolean(key_val.second); + } + } + } + else if (all_contin) { + res = contin_seq(_types.size(), 0.0); + contin_seq& cs = res.get_seq(); + auto begin_sparse = seq.cbegin() + _fixed_arity; + transform(seq.begin(), begin_sparse, cs.begin(), token_to_contin); + for (auto it = begin_sparse; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + cs[idx] = token_to_contin(key_val.second); + } + } + } + else { + res = vertex_seq(_types.size()); + vertex_seq& vs = res.get_seq(); + auto begin_sparse_types = _types.cbegin() + _fixed_arity; + auto begin_sparse_seq = seq.cbegin() + _fixed_arity; + transform(_types.begin(), begin_sparse_types, + seq.begin(), vs.begin(), token_to_vertex); + for (auto it = begin_sparse_seq; it != seq.end(); ++it) { + auto key_val = parse_key_val(*it); + if (key_val != std::pair()) { + size_t idx = _index.at(key_val.first); + vs[idx] = token_to_vertex(_types[idx], key_val.second); + } + } + } + return res; + } + std::map _index; + size_t _fixed_arity; }; From 2556dbd74cb67ccc3971af94c7e70b8659ed3c7a Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 15:14:38 +0300 Subject: [PATCH 14/56] Ongoing conversion efforts --- opencog/atoms/value/README.md | 6 + opencog/persist/csv/table_read.cc | 246 +++++++++--------------------- 
opencog/persist/csv/table_read.h | 16 +- 3 files changed, 89 insertions(+), 179 deletions(-) diff --git a/opencog/atoms/value/README.md b/opencog/atoms/value/README.md index 95c2eb6ff0..60662eb7a8 100644 --- a/opencog/atoms/value/README.md +++ b/opencog/atoms/value/README.md @@ -94,4 +94,10 @@ Adding New Atom and Value Types Please see the [README-Adding-New-Atom-Types.md](../atom_types/README-Adding-New-Atom-Types.md) file. +See also the [Custom Types Example](../../../examples/type-system/README.md) +TODO +---- +* Perhaps add a TypeValue, which would be a vector of Types. If could + be useful as a kind-of table signature (for the csv table handling + code). diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 3c8ddf1bd2..22854331bc 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -330,7 +330,7 @@ ValuePtr opencog::token_to_vertex(Type tipe, const std::string& token) * the appropriate type, and thunking for the header, and ignoring * certain features, must all be done as a separate step. */ -std::istream& istreamRawITable(std::istream& in, std::vector& tab, +std::istream& istreamRawITable(std::istream& in, ITable& tab, const std::vector& ignored_indices) { std::streampos beg = in.tellg(); @@ -395,118 +395,9 @@ std::istream& istreamRawITable(std::istream& in, std::vector& tab, return in; } -std::vector get_header(const std::string& file_name) -{ - std::ifstream in(file_name.c_str()); - std::string line; - get_data_line(in, line); - return tokenizeRow(line); -} - // =========================================================== -/** - * Visitor to parse a list of strings (buried in a multi_type_seq) - * into a multi_type_seq containing the typed values given the input - * type signature. 
- */ -struct from_tokens_visitor : public boost::static_visitor -{ - from_tokens_visitor(const std::vector& types) : _types(types) - { - all_boolean = boost::count(types, BOOL_VALUE) == (int)types.size(); - all_contin = boost::count(types, FLOAT_VALUE) == (int)types.size(); - } - result_type operator()(const string_seq& seq) - { - result_type res; - if (all_boolean) { - res = builtin_seq(); - builtin_seq& bs = res.get_seq(); - boost::transform(seq, back_inserter(bs), token_to_boolean); - } - else if (all_contin) { - res = contin_seq(); - contin_seq& cs = res.get_seq(); - boost::transform(seq, back_inserter(cs), token_to_contin); - } - else { - res = vertex_seq(); - vertex_seq& vs = res.get_seq(); - boost::transform(_types, seq, back_inserter(vs), token_to_vertex); - } - return res; - } - template result_type operator()(const Seq& seq) { - OC_ASSERT(false, "You are not supposed to do that"); - return result_type(); - } - const type_node_seq& _types; - bool all_boolean, all_contin; -}; - -/** - * The class below tokenizes one row, and jams it into the table - */ -struct from_sparse_tokens_visitor : public from_tokens_visitor -{ - from_sparse_tokens_visitor(const type_node_seq& types, - const std::map& index, - size_t fixed_arity) - : from_tokens_visitor(types), _index(index), _fixed_arity(fixed_arity) {} - result_type operator()(const string_seq& seq) { - using std::transform; - using std::for_each; - result_type res; - if (all_boolean) { - res = builtin_seq(_types.size(), id::logical_false); - builtin_seq& bs = res.get_seq(); - auto begin_sparse = seq.begin() + _fixed_arity; - transform(seq.begin(), begin_sparse, bs.begin(), token_to_boolean); - for (auto it = begin_sparse; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - bs[idx] = token_to_boolean(key_val.second); - } - } - } - else if (all_contin) { - res = contin_seq(_types.size(), 0.0); - contin_seq& cs = res.get_seq(); - 
auto begin_sparse = seq.cbegin() + _fixed_arity; - transform(seq.begin(), begin_sparse, cs.begin(), token_to_contin); - for (auto it = begin_sparse; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - cs[idx] = token_to_contin(key_val.second); - } - } - } - else { - res = vertex_seq(_types.size()); - vertex_seq& vs = res.get_seq(); - auto begin_sparse_types = _types.cbegin() + _fixed_arity; - auto begin_sparse_seq = seq.cbegin() + _fixed_arity; - transform(_types.begin(), begin_sparse_types, - seq.begin(), vs.begin(), token_to_vertex); - for (auto it = begin_sparse_seq; it != seq.end(); ++it) { - auto key_val = parse_key_val(*it); - if (key_val != std::pair()) { - size_t idx = _index.at(key_val.first); - vs[idx] = token_to_vertex(_types[idx], key_val.second); - } - } - } - return res; - } - std::map _index; - size_t _fixed_arity; -}; - - -// =========================================================== +#if NOT_RIGHT_NOW /** * Fill the input table, given a file in 'sparse' format. * @@ -630,29 +521,32 @@ istream& istreamSparseITable(istream& in, ITable& tab) return in; } +#endif /** * Infer the column types of the input table. It is assumed the * table's rows are vector of strings. */ -type_node_seq infer_column_types(const ITable& tab) +std::vector infer_column_types(const std::vector& tab) { - vector::const_iterator rowit = tab.begin(); + std::vector::const_iterator rowit = tab.begin(); - arity_t arity = rowit->size(); - type_node_seq types(arity, id::unknown_type); + size_t arity = rowit->size(); + std::vector types(arity, VOID_VALUE); - // Skip the first line, it might be a header... - // and that would confuse type inference. - if (tab.size() > 1) - ++rowit; - for (; rowit != tab.end(); ++rowit) - { - const string_seq& tokens = rowit->get_seq(); - for (arity_t i=0; i 1) + ++rowit; + + // Loop over all rows; this performs a consistency check. 
+ for (; rowit != tab.end(); ++rowit) + { + const string_seq& tokens = *rowit; + for (size_t i=0; i& col_types) { - const string_seq& row = tab.begin()->get_seq(); + const string_seq& row = *tab.begin(); - arity_t arity = row.size(); + size_t arity = row.size(); - for (arity_t i=0; i& tokens, const type_node_seq& col_types) +bool is_header(const string_seq& tokens, const std::vector& col_types) { - for (size_t i = 0; i < tokens.size(); i++) { - type_node flt = infer_type_from_token2(col_types[i], tokens[i]); - if ((id::enum_type == flt) && (id::enum_type != col_types[i])) - return true; - } - return false; + for (size_t i = 0; i < tokens.size(); i++) + { + Type flt = infer_type_from_token2(col_types[i], tokens[i]); + if ((STRING_VALUE == flt) && (STRING_VALUE != col_types[i])) + return true; + } + return false; +} + +std::vector get_header(const std::string& file_name) +{ + std::ifstream in(file_name.c_str()); + std::string line; + get_data_line(in, line); + return tokenizeRow(line); } /** @@ -699,41 +603,41 @@ bool is_header(const vector& tokens, const type_node_seq& col_types) * infer the column types, and the presence of a header. */ istream& istreamITable(istream& in, ITable& tab, - const vector& ignore_features) + const std::vector& ignore_features) { - try { - istreamRawITable(in, tab); - } - catch (std::exception& e) { - istreamSparseITable(in, tab); - // Get rid of the unwanted columns. - tab.delete_columns(ignore_features); - return in; - } + istreamRawITable(in, tab); + try { + } + catch (std::exception& e) { + istreamSparseITable(in, tab); + // Get rid of the unwanted columns. + tab.delete_columns(ignore_features); + return in; + } - // Determine the column types. - type_node_seq col_types = infer_column_types(tab); - tab.set_types(col_types); + // Determine the column types. + type_node_seq col_types = infer_column_types(tab); + tab.set_types(col_types); - // If there is a header row, then it must be the column labels. 
- if (has_header(tab, col_types)) { - tab.set_labels(tab.begin()->get_seq()); - tab.erase(tab.begin()); - } + // If there is a header row, then it must be the column labels. + if (has_header(tab, col_types)) { + tab.set_labels(tab.begin()->get_seq()); + tab.erase(tab.begin()); + } - // Now that we have some column labels to work off of, - // Get rid of the unwanted columns. - tab.delete_columns(ignore_features); + // Now that we have some column labels to work off of, + // Get rid of the unwanted columns. + tab.delete_columns(ignore_features); - // Finally, perform a column type conversion - from_tokens_visitor ftv(tab.get_types()); - auto aft = apply_visitor(ftv); - OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), - [&](multi_type_seq& seq) { - return aft(seq.get_variant()); - }); + // Finally, perform a column type conversion + from_tokens_visitor ftv(tab.get_types()); + auto aft = apply_visitor(ftv); + OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), + [&](multi_type_seq& seq) { + return aft(seq.get_variant()); + }); - return in; + return in; } /** diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 25b3aa5251..4aa885b14d 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -85,24 +85,24 @@ static std::vector tokenizeRow ( // =========================================================== +// TODO: Should this be a StringValue? +typedef std::vector string_seq; + +typedef std::vector ITable; + // TODO Should this be a TableValue? 
-class Table : public std::vector -{ - public: - Table(void); -}; +typedef std::vector Table; // =========================================================== -typedef std::vector string_seq; // Get the header of a DSV file (assuming there is one) string_seq get_header(const std::string& input_file); std::istream& istreamRawITable( - std::istream& in, std::vector& table, + std::istream& in, ITable& table, const std::vector& ignored_indices=std::vector()); -std::istream& istreamITable(std::istream& in, Table& tab, +std::istream& istreamITable(std::istream& in, ITable& tab, const string_seq& ignore_features); std::istream& istreamTable(std::istream& in, Table& tab, From 4709381d54747d703ae818c65e0726306fe974e3 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 15:34:10 +0300 Subject: [PATCH 15/56] More conversions --- opencog/persist/csv/table_read.cc | 148 ++---------------------------- 1 file changed, 7 insertions(+), 141 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 22854331bc..9d6ee86369 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -397,132 +397,6 @@ std::istream& istreamRawITable(std::istream& in, ITable& tab, // =========================================================== -#if NOT_RIGHT_NOW -/** - * Fill the input table, given a file in 'sparse' format. - * - * The sparse table format consists of some fixed number of columns, - * in comma-separated format, followed by key-value pairs, also - * tab-separated. viz: - * - * val, val, val, name:val, name:val, name:val - * - * Thus, for example, a row such as - * - * earn, issued : 1, results : 2, ending : 1, including : 1 - * - * indicates that there one fixed column, of enum type, (the enum value - * being "earn"), and that features called "issued", "ending" and - * "including" have a contin value of 1.0 and "results" has a contin - * value of 2. 
- * - * The routine does NOT store the table in sparse format: it stores the - * full, exploded table. This could be bad ... - * TODO: we really need a sparse table format, as well. - * - * The "Raw" format has all data as strings; type conversion to the - * appropriate type, must all be done as a separate step. - */ -istream& istreamSparseITable(istream& in, ITable& tab) -{ - // The raw dataset - std::vector lines; - - // The first non-comment line is assumed to be the header. - // ... unless it isn't. (The header must not contain a colon). - vector labs; - size_t fixed_arity = 0; - std::string header; - get_data_line(in, header); - if (string::npos == header.find(sparse_delim)) { - // Determine the arity of the fixed columns - vector hdr = tokenizeSparseRow(header); - fixed_arity = hdr.size(); - labs = hdr; - } - else { - lines.push_back(header); - } - - // Get the entire dataset into memory - std::string iline; - while (get_data_line(in, iline)) - lines.push_back(iline); - - if (0 == fixed_arity) { - vector fixy = tokenizeSparseRow(lines[0]); - // count commas, until a semi-colon is found. - while (string::npos == fixy[fixed_arity].find(sparse_delim)) - fixed_arity++; - } - logger().info() << "Sparse file fixed column count=" << fixed_arity; - - // Get a list of all of the features. - set feats; - // All sparse features have the same type. - type_node feat_type = id::unknown_type; - - // Fixed features may have different types, by column. - type_node_seq types(fixed_arity, id::unknown_type); - - for (const std::string& line : lines) { - vector chunks = tokenizeSparseRow(line); - vector::const_iterator pit = chunks.begin(); - - // Infer the types of the fixed features. - size_t off = 0; - for (; off < fixed_arity; ++off, ++pit) - types[off] = infer_type_from_token2(types[off], *pit); - - for (; pit != chunks.end(); ++pit) { - // Rip out the key-value pairs - auto key_val = parse_key_val(*pit); - if (key_val == pair()) - break; - // Store the key, uniquely. 
Store best guess as the type. - feats.insert(key_val.first); - feat_type = infer_type_from_token2(feat_type, key_val.second); - } - } - logger().info() << "Sparse file unique features count=" << feats.size(); - logger().info() << "Sparse file feature type=" << feat_type; - logger().info() << "Sparse file row count=" << lines.size(); - - // Convert the feature set into a list of labels. - // 'index' is a map from feature name to column number. - size_t cnt = fixed_arity; - std::map index; - for (const std::string& key : feats) { - types.push_back(feat_type); - labs.push_back(key); - index[key] = cnt; - cnt++; - } - tab.set_labels(labs); - tab.set_types(types); - - // And finally, stuff up the table. - from_sparse_tokens_visitor fstv(types, index, fixed_arity); - auto fill_line = [&](int i) - { - const std::string& line = lines[i]; - // Tokenize the line - vector chunks = tokenizeSparseRow(line); - multi_type_seq row = fstv(chunks); - tab[i] = row; - }; - - // Vector of indices [0, lines.size()) - size_t ls = lines.size(); - tab.resize(ls); - auto ir = boost::irange((size_t)0, ls); - vector indices(ir.begin(), ir.end()); - OMP_ALGO::for_each(indices.begin(), indices.end(), fill_line); - - return in; -} -#endif - /** * Infer the column types of the input table. It is assumed the * table's rows are vector of strings. @@ -602,32 +476,24 @@ std::vector get_header(const std::string& file_name) * the entire table, as a collection of strings. Next, it tries to * infer the column types, and the presence of a header. */ -istream& istreamITable(istream& in, ITable& tab, - const std::vector& ignore_features) +std::istream& istreamITable(std::istream& in, ITable& tab, + const std::vector& ignore_features) { istreamRawITable(in, tab); - try { - } - catch (std::exception& e) { - istreamSparseITable(in, tab); - // Get rid of the unwanted columns. - tab.delete_columns(ignore_features); - return in; - } // Determine the column types. 
- type_node_seq col_types = infer_column_types(tab); - tab.set_types(col_types); + std::vector col_types = infer_column_types(tab); // If there is a header row, then it must be the column labels. - if (has_header(tab, col_types)) { - tab.set_labels(tab.begin()->get_seq()); + if (has_header(tab, col_types)) + { + // tab.set_labels(*tab.begin()); tab.erase(tab.begin()); } // Now that we have some column labels to work off of, // Get rid of the unwanted columns. - tab.delete_columns(ignore_features); + // tab.delete_columns(ignore_features); // Finally, perform a column type conversion from_tokens_visitor ftv(tab.get_types()); From 40648536da270884d3c998453aef17455911c623 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 15:41:55 +0300 Subject: [PATCH 16/56] Remove cruft --- opencog/persist/csv/table_read.cc | 173 +++--------------------------- 1 file changed, 16 insertions(+), 157 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 9d6ee86369..1c138d36bf 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -495,6 +495,12 @@ std::istream& istreamITable(std::istream& in, ITable& tab, // Get rid of the unwanted columns. // tab.delete_columns(ignore_features); + // determined ignore_indices + std::vector ignore_indices = get_indices(ignore_features, + get_header(file_name)); + + +.... // Finally, perform a column type conversion from_tokens_visitor ftv(tab.get_types()); auto aft = apply_visitor(ftv); @@ -506,166 +512,19 @@ std::istream& istreamITable(std::istream& in, ITable& tab, return in; } -/** - * Like istreamITable but add the option to ignore indices. - * - * It's akind of a temporary hack, till it's clear that this is much - * faster and we should recode istreamITable to ignore features - * head-on. - * - * Also, it assumes that the dataset is not sparse. 
- */ -istream& istreamITable_ignore_indices(istream& in, ITable& tab, - const vector& ignore_indices) -{ - istreamRawITable(in, tab, ignore_indices); - - // Determine the column types. - type_node_seq col_types = infer_column_types(tab); - tab.set_types(col_types); - - // If there is a header row, then it must be the column labels. - if (has_header(tab, col_types)) { - tab.set_labels(tab.begin()->get_seq()); - tab.erase(tab.begin()); - } - - // Finally, perform a column type conversion - from_tokens_visitor ftv(tab.get_types()); - auto aft = apply_visitor(ftv); - OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), - [&](multi_type_seq& seq) { - return aft(seq.get_variant()); - }); - - return in; -} - -/** - * Take a line and return a triple with vector containing the input - * elements, output element and timestamp. - */ -std::tuple, std::string, std::string> -tokenizeRowIOT(const std::string& line, - const std::vector& ignored_indices, - int target_idx, // < 0 == ignored - int timestamp_idx) // < 0 == ignored -{ - std::tuple, std::string, std::string> res; - table_tokenizer toker = get_row_tokenizer(line); - int i = 0; - for (const std::string& tok : toker) { - if (!boost::binary_search(ignored_indices, i)) { - std::string el = boost::lexical_cast(tok); - if (target_idx == i) - std::get<1>(res) = el; - else if (timestamp_idx == i) - std::get<2>(res) = el; - else - std::get<0>(res).push_back(el); - } - i++; - } - return res; -} - ITable loadITable(const std::string& file_name, - const vector& ignore_features) + const std::vector& ignore_features) { - OC_ASSERT(!file_name.empty(), "the file name is empty"); - ifstream in(file_name.c_str()); - OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); - - ITable res; - istreamITable(in, res, ignore_features); - return res; -} - -/** - * Like loadITable but it is optimized by ignoring features head-on - * (rather than loading them, then removing them. - * - * WARNING: it assumes the dataset has a header!!! 
- */ -ITable loadITable_optimized(const std::string& file_name, - const vector& ignore_features) -{ - OC_ASSERT(!file_name.empty(), "the file name is empty"); - ifstream in(file_name.c_str()); - OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); - - // determined ignore_indices - vector ignore_indices = get_indices(ignore_features, - get_header(file_name)); - - ITable res; - istreamITable_ignore_indices(in, res, ignore_indices); - return res; -} - -/** - * Fill an input table and output table given a DSV - * (delimiter-seperated values) file format, where delimiters are ',', - * ' ' or '\t'. - * - * It is assumed that each row have the same number of columns, if not - * an assert is raised. - * - * pos specifies the position of the output, if -1 it is the last - * position. The default position is 0, the first column. - * - * This is only used for sparse table and could be optimized - */ -istream& istreamTable_OLD(istream& in, Table& tab, - const std::string& target_feature, - const std::vector& ignore_features) -{ - istreamITable(in, tab.itable, ignore_features); - - tab.otable = tab.itable.get_column_data(target_feature); - OC_ASSERT(0 != tab.otable.size(), - "Fatal Error: target feature \"%s\" not found", - target_feature.c_str()); - - tab.target_pos = tab.itable.get_column_offset(target_feature); - - type_node targ_type = tab.itable.get_type(target_feature); - - std::string targ_feat = tab.itable.delete_column(target_feature); - - tab.otable.set_label(targ_feat); - tab.otable.set_type(targ_type); - - return in; -} - -/** - * Like istreamTable but optimize by ignoring features head-on rather - * than loading them then removing them. - * - * Warning: only works on dense data with header file. 
- */ -istream& istreamTable_ignore_indices(istream& in, Table& tab, - const std::string& target_feature, - const std::vector& ignore_indices) -{ - istreamITable_ignore_indices(in, tab.itable, ignore_indices); - - tab.otable = tab.itable.get_column_data(target_feature); - OC_ASSERT(0 != tab.otable.size(), - "Fatal Error: target feature \"%s\" not found", - target_feature.c_str()); - - tab.target_pos = tab.itable.get_column_offset(target_feature); - - type_node targ_type = tab.itable.get_type(target_feature); - - std::string targ_feat = tab.itable.delete_column(target_feature); - - tab.otable.set_label(targ_feat); - tab.otable.set_type(targ_type); + if (file_name.empty()) + throw RuntimeException(TRACE_INFO, "The file name is empty!"); + std::ifstream in(file_name.c_str()); + if (not in.is_open()) + throw RuntimeException(TRACE_INFO, + "Could not open %s", file_name.c_str()); - return in; + ITable res; + istreamITable(in, res, ignore_features); + return res; } // ================================================================== From 7f22f38ee8a501329406cf6d383711a5d092b9f6 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 19:47:13 +0300 Subject: [PATCH 17/56] Whitespace rework --- opencog/persist/csv/table_read.cc | 264 ++++++++++++++---------------- opencog/persist/csv/table_read.h | 13 +- 2 files changed, 130 insertions(+), 147 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 1c138d36bf..1b776ef19d 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -43,6 +43,7 @@ #include #include +#include #include #include #include @@ -468,6 +469,8 @@ std::vector get_header(const std::string& file_name) return tokenizeRow(line); } +#if 0 + /** * Fill the input table only, given a DSV (delimiter-seperated values) * file format, where delimiters are ',', ' ' or '\t'. 
@@ -512,139 +515,125 @@ std::istream& istreamITable(std::istream& in, ITable& tab, return in; } -ITable loadITable(const std::string& file_name, - const std::vector& ignore_features) -{ - if (file_name.empty()) - throw RuntimeException(TRACE_INFO, "The file name is empty!"); - std::ifstream in(file_name.c_str()); - if (not in.is_open()) - throw RuntimeException(TRACE_INFO, - "Could not open %s", file_name.c_str()); - - ITable res; - istreamITable(in, res, ignore_features); - return res; -} +#endif // ================================================================== -static istream& -inferTableAttributes(istream& in, const std::string& target_feature, - const std::string& timestamp_feature, +static std::istream& +inferTableAttributes(std::istream& in, const vector& ignore_features, - type_tree& tt, bool& has_header, bool& is_sparse) + std::vector& tt, bool& has_header) { - // maxline is the maximum number of lines to read to infer the - // attributes. A negative number means reading all lines. - int maxline = 20; - streampos beg = in.tellg(); - // Get a portion of the dataset into memory (cleaning weird stuff) - std::vector lines; - { - std::string line; - is_sparse = false; - while (get_data_line(in, line) && maxline-- > 0) { - // It is sparse - is_sparse = is_sparse || std::string::npos != line.find(sparse_delim); - if (is_sparse) { // just get out - // TODO could be simplified, optimized, etc - in.seekg(beg); - in.clear(); // in case it has reached the eof - return in; - } - - // put the line in a buffer - lines.push_back(line); - } - } + // maxline is the maximum number of lines to read to infer the + // attributes. A negative number means reading all lines. 
+ int maxline = 20; + streampos beg = in.tellg(); - // parse what could be a header - const vector maybe_header = tokenizeRow(lines.front()); - - // determine arity - arity_t arity = maybe_header.size(); - std::atomic arity_fail_row(-1); - - // determine initial type - type_node_seq types(arity, id::unknown_type); - - // parse the rest, determine its type and whether the arity is - // consistent - for (size_t i = 1; i < lines.size(); ++i) { - // Parse line - const string_seq& tokens = tokenizeRow(lines[i]); - - // Check arity - if (arity != (arity_t)tokens.size()) { - arity_fail_row = i + 1; - in.seekg(beg); - in.clear(); // in case it has reached the eof - OC_ASSERT(false, - "ERROR: Input file inconsistent: the %uth row has a " - "different number of columns than the rest of the file. " - "All rows should have the same number of columns.\n", - arity_fail_row.load()); - } + // Get a portion of the dataset into memory (cleaning weird stuff) + std::vector lines; + { + std::string line; + while (get_data_line(in, line) && maxline-- > 0) { + // It is sparse + is_sparse = is_sparse || std::string::npos != line.find(sparse_delim); + if (is_sparse) { // just get out + // TODO could be simplified, optimized, etc + in.seekg(beg); + in.clear(); // in case it has reached the eof + return in; + } + + // put the line in a buffer + lines.push_back(line); + } + } - // Infer type - boost::transform(types, tokens, types.begin(), - infer_type_from_token2); - } + // parse what could be a header + const vector maybe_header = tokenizeRow(lines.front()); - // Determine has_header - has_header = is_header(maybe_header, types); + // determine arity + arity_t arity = maybe_header.size(); + std::atomic arity_fail_row(-1); - // Determine type signature - if (has_header) { + // determine initial type + type_node_seq types(arity, id::unknown_type); - // if unspecified, the target is the first column - unsigned target_idx = 0; + // parse the rest, determine its type and whether the arity is + 
// consistent + for (size_t i = 1; i < lines.size(); ++i) { + // Parse line + const string_seq& tokens = tokenizeRow(lines[i]); - // target feature will be ignored - if (!target_feature.empty()) { - auto target_it = std::find(maybe_header.begin(), maybe_header.end(), - target_feature); - OC_ASSERT(target_it != maybe_header.end(), "Target %s not found", - target_feature.c_str()); - target_idx = std::distance(maybe_header.begin(), target_it); - } - vector ignore_idxs = - get_indices(ignore_features, maybe_header); - ignore_idxs.push_back(target_idx); - boost::sort(ignore_idxs); + // Check arity + if (arity != (arity_t)tokens.size()) { + arity_fail_row = i + 1; + in.seekg(beg); + in.clear(); // in case it has reached the eof + OC_ASSERT(false, + "ERROR: Input file inconsistent: the %uth row has a " + "different number of columns than the rest of the file. " + "All rows should have the same number of columns.\n", + arity_fail_row.load()); + } - // Include timestamp feature as idx to ignore - if (!timestamp_feature.empty()) { - auto timestamp_it = std::find(maybe_header.begin(), maybe_header.end(), - timestamp_feature); - OC_ASSERT(timestamp_it != maybe_header.end(), - "Timestamp feature %s not found", - timestamp_feature.c_str()); - unsigned timestamp_idx = std::distance(maybe_header.begin(), timestamp_it); - ignore_idxs.push_back(timestamp_idx); - boost::sort(ignore_idxs); - } + // Infer type + boost::transform(types, tokens, types.begin(), + infer_type_from_token2); + } - // Generate type signature - type_node otype = types[target_idx]; - type_node_seq itypes; - for (unsigned i = 0; i < types.size(); ++i) - if (!boost::binary_search(ignore_idxs, i)) - itypes.push_back(types[i]); - tt = gen_signature(itypes, otype); - } else { - // No header, the target is the first column - type_node otype = types[0]; - types.erase(types.begin()); - tt = gen_signature(types, otype); - } - logger().debug() << "Infered type tree: " << tt; + // Determine has_header + has_header = 
is_header(maybe_header, types); - in.seekg(beg); - in.clear(); // in case it has reached the eof - return in; + // Determine type signature + if (has_header) { + + // if unspecified, the target is the first column + unsigned target_idx = 0; + + // target feature will be ignored + if (!target_feature.empty()) { + auto target_it = std::find(maybe_header.begin(), maybe_header.end(), + target_feature); + OC_ASSERT(target_it != maybe_header.end(), "Target %s not found", + target_feature.c_str()); + target_idx = std::distance(maybe_header.begin(), target_it); + } + vector ignore_idxs = + get_indices(ignore_features, maybe_header); + ignore_idxs.push_back(target_idx); + boost::sort(ignore_idxs); + + // Include timestamp feature as idx to ignore + if (!timestamp_feature.empty()) { + auto timestamp_it = std::find(maybe_header.begin(), maybe_header.end(), + timestamp_feature); + OC_ASSERT(timestamp_it != maybe_header.end(), + "Timestamp feature %s not found", + timestamp_feature.c_str()); + unsigned timestamp_idx = std::distance(maybe_header.begin(), timestamp_it); + ignore_idxs.push_back(timestamp_idx); + boost::sort(ignore_idxs); + } + + // Generate type signature + type_node otype = types[target_idx]; + type_node_seq itypes; + for (unsigned i = 0; i < types.size(); ++i) + if (!boost::binary_search(ignore_idxs, i)) + itypes.push_back(types[i]); + tt = gen_signature(itypes, otype); + } else { + // No header, the target is the first column + type_node otype = types[0]; + types.erase(types.begin()); + tt = gen_signature(types, otype); + } + logger().debug() << "Infered type tree: " << tt; + + in.seekg(beg); + in.clear(); // in case it has reached the eof + return in; } /** @@ -657,17 +646,15 @@ inferTableAttributes(istream& in, const std::string& target_feature, * * 2) Load the actual data. 
*/ -istream& istreamTable(istream& in, Table& tab, - const std::string& target_feature, - const std::string& timestamp_feature, - const std::vector& ignore_features) +std::istream& +istreamTable(const Handle& anchor, + std::istream& in, + const std::vector& ignore_features) { // Infer the properties of the table without loading its content - type_tree tt; - bool has_header, is_sparse; - streampos beg = in.tellg(); - inferTableAttributes(in, target_feature, timestamp_feature, - ignore_features, tt, has_header, is_sparse); + bool has_header; + std::streampos beg = in.tellg(); + inferTableAttributes(in, ignore_features, tt, has_header); in.seekg(beg); if (is_sparse) { @@ -683,6 +670,7 @@ istream& istreamTable(istream& in, Table& tab, // ================================================================== +#if 0 /** * Take a line and return a pair with vector containing the input * elements and then output element. @@ -847,20 +835,22 @@ istream& istreamDenseTable(istream& in, Table& tab, ignore_idxs, tt, has_header); } +#endif + // ================================================================== -Table loadTable(const std::string& file_name, - const std::string& target_feature, - const std::string& timestamp_feature, - const string_seq& ignore_features) +void loadTable(const Handle& anchor, + const std::string& file_name, + const string_seq& ignore_features) { - OC_ASSERT(!file_name.empty(), "the file name is empty"); - ifstream in(file_name.c_str()); - OC_ASSERT(in.is_open(), "Could not open %s", file_name.c_str()); + if (file_name.empty()) + throw RuntimeException(TRACE_INFO, "The file name is empty!"); + std::ifstream in(file_name.c_str()); + if (not in.is_open()) + throw RuntimeException(TRACE_INFO, + "Could not open %s", file_name.c_str()); - Table res; - istreamTable(in, res, target_feature, timestamp_feature, ignore_features); - return res; + istreamTable(acnhro, in, ignore_features); } // ================================================================== diff 
--git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 4aa885b14d..7b8164a84b 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -119,17 +119,10 @@ Table loadITable_optimized( const string_seq& ignore_features=string_seq()); /** - * If target_feature is empty then, in case there is no header, it is - * assumed to be the first feature. */ -Table loadTable( - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - -std::istream& istreamDenseTable(std::istream&, Table&, - const string_seq& ignore_features, - const std::vector&, bool has_header); - +void loadTable(const Handle& anchor, + const std::string& file_name, + const string_seq& ignore_features=string_seq()); } // ~namespaces opencog From 277fec8c4e93a87315f30008fbe0c1bbb3dbc704 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 19:59:25 +0300 Subject: [PATCH 18/56] Convert and simplify table reading --- opencog/persist/csv/table_read.cc | 105 +++++++++--------------------- 1 file changed, 30 insertions(+), 75 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 1b776ef19d..53d9ac7c43 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -521,113 +521,68 @@ std::istream& istreamITable(std::istream& in, ITable& tab, static std::istream& inferTableAttributes(std::istream& in, - const vector& ignore_features, + const std::vector& ignore_features, std::vector& tt, bool& has_header) { + has_header = false; + + std::streampos beg = in.tellg(); // maxline is the maximum number of lines to read to infer the // attributes. A negative number means reading all lines. 
int maxline = 20; - streampos beg = in.tellg(); // Get a portion of the dataset into memory (cleaning weird stuff) - std::vector lines; - { - std::string line; - while (get_data_line(in, line) && maxline-- > 0) { - // It is sparse - is_sparse = is_sparse || std::string::npos != line.find(sparse_delim); - if (is_sparse) { // just get out - // TODO could be simplified, optimized, etc - in.seekg(beg); - in.clear(); // in case it has reached the eof - return in; - } - - // put the line in a buffer - lines.push_back(line); - } - } + std::vector lines; + std::string line; + while (get_data_line(in, line) && maxline-- > 0) + lines.push_back(line); - // parse what could be a header - const vector maybe_header = tokenizeRow(lines.front()); + // Parse what could be a header + const std::vector maybe_header = + tokenizeRow(lines.front()); - // determine arity - arity_t arity = maybe_header.size(); + // Determine arity + size_t arity = maybe_header.size(); std::atomic arity_fail_row(-1); - // determine initial type - type_node_seq types(arity, id::unknown_type); + // Determine initial type + std::vector types(arity, VOID_VALUE); - // parse the rest, determine its type and whether the arity is + // Parse the rest, determine its type and whether the arity is // consistent - for (size_t i = 1; i < lines.size(); ++i) { + for (size_t i = 1; i < lines.size(); ++i) + { // Parse line - const string_seq& tokens = tokenizeRow(lines[i]); + const string_seq& tokens = tokenizeRow(lines[i]); // Check arity - if (arity != (arity_t)tokens.size()) { + if (arity != tokens.size()) + { arity_fail_row = i + 1; in.seekg(beg); in.clear(); // in case it has reached the eof - OC_ASSERT(false, - "ERROR: Input file inconsistent: the %uth row has a " - "different number of columns than the rest of the file. 
" - "All rows should have the same number of columns.\n", - arity_fail_row.load()); + throw SyntaxException(TRACE_INFO, + "ERROR: Input file inconsistent: the %uth row has a " + "different number of columns than the rest of the file. " + "All rows should have the same number of columns.\n", + arity_fail_row.load()); } // Infer type boost::transform(types, tokens, types.begin(), - infer_type_from_token2); + infer_type_from_token2); } // Determine has_header has_header = is_header(maybe_header, types); // Determine type signature - if (has_header) { - - // if unspecified, the target is the first column - unsigned target_idx = 0; - - // target feature will be ignored - if (!target_feature.empty()) { - auto target_it = std::find(maybe_header.begin(), maybe_header.end(), - target_feature); - OC_ASSERT(target_it != maybe_header.end(), "Target %s not found", - target_feature.c_str()); - target_idx = std::distance(maybe_header.begin(), target_it); - } - vector ignore_idxs = + if (has_header) + { + std::vector ignore_idxs = get_indices(ignore_features, maybe_header); - ignore_idxs.push_back(target_idx); boost::sort(ignore_idxs); - - // Include timestamp feature as idx to ignore - if (!timestamp_feature.empty()) { - auto timestamp_it = std::find(maybe_header.begin(), maybe_header.end(), - timestamp_feature); - OC_ASSERT(timestamp_it != maybe_header.end(), - "Timestamp feature %s not found", - timestamp_feature.c_str()); - unsigned timestamp_idx = std::distance(maybe_header.begin(), timestamp_it); - ignore_idxs.push_back(timestamp_idx); - boost::sort(ignore_idxs); - } - - // Generate type signature - type_node otype = types[target_idx]; - type_node_seq itypes; - for (unsigned i = 0; i < types.size(); ++i) - if (!boost::binary_search(ignore_idxs, i)) - itypes.push_back(types[i]); - tt = gen_signature(itypes, otype); - } else { - // No header, the target is the first column - type_node otype = types[0]; - types.erase(types.begin()); - tt = gen_signature(types, otype); } 
logger().debug() << "Infered type tree: " << tt; From e45aa9f925ae0bda4006fa240dc9dab4c9f5cd14 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 20:20:43 +0300 Subject: [PATCH 19/56] More cleanup --- opencog/persist/csv/table_read.cc | 122 ++++++++++-------------------- opencog/persist/csv/table_read.h | 15 +--- 2 files changed, 40 insertions(+), 97 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 53d9ac7c43..c7d2dd253e 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -519,10 +519,30 @@ std::istream& istreamITable(std::istream& in, ITable& tab, // ================================================================== +/** + * Get indices (aka positions or offsets) of a list of labels given a + * header. The labels can be sequenced in any order, it will always + * return the order consistent with the header. + */ +static std::vector +get_indices(const string_seq &labels, + const string_seq &header) +{ + std::vector res; + for (size_t i = 0; i < header.size(); ++i) + if (std::find(labels.begin(), labels.end(), header[i]) != labels.end()) + res.push_back(i); + return res; +} + +// ================================================================== + static std::istream& inferTableAttributes(std::istream& in, const std::vector& ignore_features, - std::vector& tt, bool& has_header) + std::vector& ignore_idxs, + std::vector& tt, + bool& has_header) { has_header = false; @@ -580,11 +600,9 @@ inferTableAttributes(std::istream& in, // Determine type signature if (has_header) { - std::vector ignore_idxs = - get_indices(ignore_features, maybe_header); + ignore_idxs = get_indices(ignore_features, maybe_header); boost::sort(ignore_idxs); } - logger().debug() << "Infered type tree: " << tt; in.seekg(beg); in.clear(); // in case it has reached the eof @@ -606,21 +624,17 @@ istreamTable(const Handle& anchor, std::istream& in, const std::vector& ignore_features) { - // Infer 
the properties of the table without loading its content - bool has_header; - std::streampos beg = in.tellg(); - inferTableAttributes(in, ignore_features, tt, has_header); - in.seekg(beg); - - if (is_sparse) { - // fallback on the old loader - // TODO: this could definitely be optimized - OC_ASSERT(timestamp_feature.empty(), "Timestamp feature not implemented"); - return istreamTable_OLD(in, tab, target_feature, ignore_features); - } else { - return istreamDenseTable(in, tab, target_feature, timestamp_feature, - ignore_features, tt, has_header); - } + std::streampos beg = in.tellg(); + + // Infer the properties of the table without loading its content + bool has_header = false; + std::vector ignore_indexes; + std::vector col_types; + inferTableAttributes(in, ignore_features, ignore_indexes, + col_types, has_header); + in.seekg(beg); + + return istreamDenseTable(anchor, in, ignore_indexes, col_types, has_header); } // ================================================================== @@ -655,12 +669,10 @@ tokenizeRowIO ( // ================================================================== -static istream& -istreamDenseTable_noHeader(istream& in, Table& tab, - int target_idx, // < 0 == ignore - int timestamp_idx, // < 0 == ignore - const vector& ignore_idxs, - const type_tree& tt, bool has_header) +static std::istream& +istreamDenseTable(istream& in, Table& tab, + const vector& ignore_idxs, + const type_tree& tt, bool has_header) { // Get the entire dataset into memory (cleaning weird stuff) std::string line; @@ -732,64 +744,6 @@ istreamDenseTable_noHeader(istream& in, Table& tab, return in; } -istream& istreamDenseTable(istream& in, Table& tab, - const std::string& target_feature, - const std::string& timestamp_feature, - const vector& ignore_features, - const type_tree& tt, bool has_header) -{ - OC_ASSERT(has_header - || (target_feature.empty() - && ignore_features.empty() - && timestamp_feature.empty()), - "If the data file has no header, " - "then a target 
feature, ignore features or " - "timestamp_feature cannot be specified"); - - // determine target, timestamp and ignore indexes - int target_idx = 0; // if no header, target is at the first - // column by default - - int timestamp_idx = -1; // disabled by default - vector ignore_idxs; - if (has_header) { - std::string line; - get_data_line(in, line); - vector header = tokenizeRow(line); - - // Set target idx - if (!target_feature.empty()) { - auto target_it = std::find(header.begin(), header.end(), - target_feature); - OC_ASSERT(target_it != header.end(), "Target %s not found", - target_feature.c_str()); - target_idx = std::distance(header.begin(), target_it); - } - - // Set timestamp idx - if (!timestamp_feature.empty()) { - auto timestamp_it = std::find(header.begin(), header.end(), - timestamp_feature); - OC_ASSERT(timestamp_it != header.end(), "Timestamp feature %s not found", - timestamp_feature.c_str()); - timestamp_idx = std::distance(header.begin(), timestamp_it); - } - - // Set ignore idxs - ignore_idxs = get_indices(ignore_features, header); - - // get input and output labels from the header - auto iotlabels = tokenizeRowIOT(line, ignore_idxs, - target_idx, timestamp_idx); - tab.itable.set_labels(std::get<0>(iotlabels)); - tab.otable.set_label(std::get<1>(iotlabels)); - tab.ttable.set_label(std::get<2>(iotlabels)); - } - - return istreamDenseTable_noHeader(in, tab, target_idx, timestamp_idx, - ignore_idxs, tt, has_header); -} - #endif // ================================================================== @@ -805,7 +759,7 @@ void loadTable(const Handle& anchor, throw RuntimeException(TRACE_INFO, "Could not open %s", file_name.c_str()); - istreamTable(acnhro, in, ignore_features); + istreamTable(anchor, in, ignore_features); } // ================================================================== diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 7b8164a84b..10c7e19be5 100644 --- a/opencog/persist/csv/table_read.h +++ 
b/opencog/persist/csv/table_read.h @@ -105,21 +105,10 @@ std::istream& istreamRawITable( std::istream& istreamITable(std::istream& in, ITable& tab, const string_seq& ignore_features); -std::istream& istreamTable(std::istream& in, Table& tab, +std::istream& istreamTable(const Handle& anchor, + std::istream& in, const string_seq& ignore_features); -// TODO: reimplement loadITable with the same model of loadTable and -// remove loadITable_optimized -Table loadITable( - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - -Table loadITable_optimized( - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - -/** - */ void loadTable(const Handle& anchor, const std::string& file_name, const string_seq& ignore_features=string_seq()); From 25241339bf53b42056b2cf2fbf20868100c4fcbb Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 20:34:22 +0300 Subject: [PATCH 20/56] Reorder order of teh code --- opencog/persist/csv/table_read.cc | 84 ++++++++++++++----------------- 1 file changed, 37 insertions(+), 47 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index c7d2dd253e..508d50af74 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -609,34 +609,6 @@ inferTableAttributes(std::istream& in, return in; } -/** - * Perform 2 passes: - * - * 1) Infer - * 1.1) its type - * 1.2) whether it has a header - * 1.3) whether it is dense or sparse - * - * 2) Load the actual data. 
- */ -std::istream& -istreamTable(const Handle& anchor, - std::istream& in, - const std::vector& ignore_features) -{ - std::streampos beg = in.tellg(); - - // Infer the properties of the table without loading its content - bool has_header = false; - std::vector ignore_indexes; - std::vector col_types; - inferTableAttributes(in, ignore_features, ignore_indexes, - col_types, has_header); - in.seekg(beg); - - return istreamDenseTable(anchor, in, ignore_indexes, col_types, has_header); -} - // ================================================================== #if 0 @@ -667,36 +639,26 @@ tokenizeRowIO ( return res; } +#endif + // ================================================================== static std::istream& -istreamDenseTable(istream& in, Table& tab, - const vector& ignore_idxs, - const type_tree& tt, bool has_header) +istreamDenseTable(const Handle& anchor, + std::istream& in, + const std::vector& ignore_idxs, + const std::vector& col_types, + bool has_header) { // Get the entire dataset into memory (cleaning weird stuff) std::string line; - std::vector lines; + std::vector lines; while (get_data_line(in, line)) lines.push_back(line); - // Allocate all rows in the itable, otable and ttable - tab.itable.resize(lines.size()); - tab.otable.resize(lines.size()); - if (timestamp_idx >= 0) - tab.ttable.resize(lines.size()); - // Get the elementary io types type_node_seq itypes = vector_comp(get_signature_inputs(tt), get_type_node); - type_node otype = get_type_node(get_signature_output(tt)); - - // Assign the io type to the table - tab.itable.set_types(itypes); - tab.otable.set_type(otype); - - // Instantiate type conversion for inputs - from_tokens_visitor ftv(itypes); // Function to parse each line (to be called in parallel) auto parse_line = [&](unsigned i) { @@ -744,7 +706,35 @@ istreamDenseTable(istream& in, Table& tab, return in; } -#endif +// ================================================================== + +/** + * Perform 2 passes: + * + * 1) Infer + * 
1.1) its type + * 1.2) whether it has a header + * 1.3) whether it is dense or sparse + * + * 2) Load the actual data. + */ +std::istream& +istreamTable(const Handle& anchor, + std::istream& in, + const std::vector& ignore_features) +{ + std::streampos beg = in.tellg(); + + // Infer the properties of the table without loading its content + bool has_header = false; + std::vector ignore_indexes; + std::vector col_types; + inferTableAttributes(in, ignore_features, ignore_indexes, + col_types, has_header); + in.seekg(beg); + + return istreamDenseTable(anchor, in, ignore_indexes, col_types, has_header); +} // ================================================================== From c7b6ca9ab73904ec790fa09c111b16f69d86ffc6 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 20:39:54 +0300 Subject: [PATCH 21/56] Code that compiles. --- opencog/persist/csv/table_read.cc | 12 +++++------- opencog/persist/csv/table_read.h | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 508d50af74..febf61a03e 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -656,10 +656,7 @@ istreamDenseTable(const Handle& anchor, while (get_data_line(in, line)) lines.push_back(line); - // Get the elementary io types - type_node_seq itypes = - vector_comp(get_signature_inputs(tt), get_type_node); - +#if 0 // Function to parse each line (to be called in parallel) auto parse_line = [&](unsigned i) { try { @@ -702,6 +699,7 @@ istreamDenseTable(const Handle& anchor, if (timestamp_idx >= 0) tab.timestamp_pos = timestamp_idx - boost::count_if(ignore_idxs, arg1 < timestamp_idx); +#endif return in; } @@ -719,9 +717,9 @@ istreamDenseTable(const Handle& anchor, * 2) Load the actual data. 
*/ std::istream& -istreamTable(const Handle& anchor, - std::istream& in, - const std::vector& ignore_features) +opencog::istreamTable(const Handle& anchor, + std::istream& in, + const std::vector& ignore_features) { std::streampos beg = in.tellg(); diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 10c7e19be5..d03425b8a8 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -105,8 +105,8 @@ std::istream& istreamRawITable( std::istream& istreamITable(std::istream& in, ITable& tab, const string_seq& ignore_features); -std::istream& istreamTable(const Handle& anchor, - std::istream& in, +std::istream& istreamTable(const Handle&, + std::istream&, const string_seq& ignore_features); void loadTable(const Handle& anchor, From 70b2eaa407f06778c932daffed2e81a178138187 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 20:47:01 +0300 Subject: [PATCH 22/56] Remove unused code --- opencog/persist/csv/table_read.cc | 68 ++----------------------------- 1 file changed, 3 insertions(+), 65 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index febf61a03e..b29e3f0e8d 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -59,20 +59,6 @@ using boost::phoenix::arg_names::arg1; // ------------------------------------------------------- -/** - * Return true if the next chars in 'in' correspond to carriage return - * (support UNIX and DOS format) and advance in of the checked chars. 
- */ -static bool checkCarriageReturn(std::istream& in) -{ - char next_c = in.get(); - if (next_c == '\r') // DOS format - next_c = in.get(); - if (next_c == '\n') - return true; - return false; -} - /** * remove the carriage return (for DOS format) */ @@ -135,35 +121,6 @@ std::istream& get_data_line(std::istream& is, std::string& line) // ------------------------------------------------------- -static const char *sparse_delim = " : "; - -/** - * parse a pair of key/value in a parse dataset, using ':' as - * delimiter. For instance - * - * parse_key_val("key : val") - * - * returns - * - * {"key", "val"} - * - * If no such delimiter is found then it return a pair with empty key - * and empty val. - */ -static std::pair -parse_key_val(const std::string& chunk) -{ - std::pair res; - size_t pos = chunk.find(sparse_delim); - if (std::string::npos == pos) - return res; - std::string key = chunk.substr(0, pos); - boost::trim(key); - std::string val = chunk.substr(pos + strlen(sparse_delim)); - boost::trim(val); - return {key, val}; -} - /** * Take a row, return a tokenizer. Tokenization uses the * separator characters comma, blank, tab (',', ' ' or '\t'). @@ -322,6 +279,7 @@ ValuePtr opencog::token_to_vertex(Type tipe, const std::string& token) // =========================================================== // istream regular tables. +static const char *sparse_delim = " : "; /** * Fill the input table, given a file in DSV (delimiter-seperated values) @@ -424,33 +382,13 @@ std::vector infer_column_types(const std::vector& tab) return types; } -/** - * Infer the column types of the first line of a raw input table and - * compare it to the given column types. If there is a mis-match, - * then the first row must be a header, i.e. a set of ascii column - * labels. 
- */ -static bool has_header(ITable& tab, const std::vector& col_types) -{ - const string_seq& row = *tab.begin(); - - size_t arity = row.size(); - - for (size_t i=0; i& col_types) +static bool +is_header(const string_seq& tokens, const std::vector& col_types) { for (size_t i = 0; i < tokens.size(); i++) { From b4789caca68d8ffcc70823c8b2cae58402f908d8 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 20:57:40 +0300 Subject: [PATCH 23/56] Remove more dead code --- opencog/persist/csv/table_read.cc | 123 ++++++------------------------ 1 file changed, 23 insertions(+), 100 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index b29e3f0e8d..56ac84786e 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -407,54 +407,6 @@ std::vector get_header(const std::string& file_name) return tokenizeRow(line); } -#if 0 - -/** - * Fill the input table only, given a DSV (delimiter-seperated values) - * file format, where delimiters are ',', ' ' or '\t'. - * - * This algorithm makes several passes over the data. First, it reads - * the entire table, as a collection of strings. Next, it tries to - * infer the column types, and the presence of a header. - */ -std::istream& istreamITable(std::istream& in, ITable& tab, - const std::vector& ignore_features) -{ - istreamRawITable(in, tab); - - // Determine the column types. - std::vector col_types = infer_column_types(tab); - - // If there is a header row, then it must be the column labels. - if (has_header(tab, col_types)) - { - // tab.set_labels(*tab.begin()); - tab.erase(tab.begin()); - } - - // Now that we have some column labels to work off of, - // Get rid of the unwanted columns. - // tab.delete_columns(ignore_features); - - // determined ignore_indices - std::vector ignore_indices = get_indices(ignore_features, - get_header(file_name)); - - -.... 
- // Finally, perform a column type conversion - from_tokens_visitor ftv(tab.get_types()); - auto aft = apply_visitor(ftv); - OMP_ALGO::transform(tab.begin(), tab.end(), tab.begin(), - [&](multi_type_seq& seq) { - return aft(seq.get_variant()); - }); - - return in; -} - -#endif - // ================================================================== /** @@ -549,38 +501,6 @@ inferTableAttributes(std::istream& in, // ================================================================== -#if 0 -/** - * Take a line and return a pair with vector containing the input - * elements and then output element. - */ -template -std::pair, T> -tokenizeRowIO ( - const std::string& line, - const std::vector& ignored_indices=std::vector(), - unsigned target_idx=0) -{ - std::pair, T> res; - table_tokenizer toker = get_row_tokenizer(line); - size_t i = 0; - for (const std::string& tok : toker) { - if (!boost::binary_search(ignored_indices, i)) { - T el = boost::lexical_cast(tok); - if (target_idx == i) - res.second = el; - else - res.first.push_back(el); - } - i++; - } - return res; -} - -#endif - -// ================================================================== - static std::istream& istreamDenseTable(const Handle& anchor, std::istream& in, @@ -588,11 +508,29 @@ istreamDenseTable(const Handle& anchor, const std::vector& col_types, bool has_header) { - // Get the entire dataset into memory (cleaning weird stuff) - std::string line; - std::vector lines; - while (get_data_line(in, line)) - lines.push_back(line); + std::string line; + + // Assume the stream is at the begining. + // If there is a header, skip one line. + if (has_header) + get_data_line(in, line); + + // Loop over all lines in the table, one by one. 
+ while (get_data_line(in, line)) + { + table_tokenizer toker = get_row_tokenizer(line); + size_t i = 0; + for (const std::string& tok : toker) { + if (!boost::binary_search(ignored_indices, i)) { + T el = boost::lexical_cast(tok); + if (target_idx == i) + res.second = el; + else + res.first.push_back(el); + } + i++; + } + } #if 0 // Function to parse each line (to be called in parallel) @@ -601,21 +539,6 @@ istreamDenseTable(const Handle& anchor, // Fill input auto tokenIOT = tokenizeRowIOT(lines[i], ignore_idxs, target_idx, timestamp_idx); - tab.itable[i] = ftv(std::get<0>(tokenIOT)); - - // Fill output - std::string output_str = std::get<1>(tokenIOT); - // If there is no valid target index, then there is no - // "output" column! - if ("" != output_str) - tab.otable[i] = token_to_vertex(otype, output_str); - - // Fill date - std::string date_str = std::get<2>(tokenIOT); - // If there is no valid timestamp index, then there is no - // "output" column! - if ("" != date_str) - tab.ttable[i] = TTable::from_string(date_str); } catch (AssertionException& ex) { unsigned lineno = has_header? i+1 : i; From 41a43a1b3af12eeaa4bb396b74c284a6feef2139 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 21:16:13 +0300 Subject: [PATCH 24/56] Prepare columns that will be filled in. --- opencog/persist/csv/table_read.cc | 36 ++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 56ac84786e..c712ee8adc 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -125,7 +125,7 @@ std::istream& get_data_line(std::istream& is, std::string& line) * Take a row, return a tokenizer. Tokenization uses the * separator characters comma, blank, tab (',', ' ' or '\t'). 
*/ -table_tokenizer get_row_tokenizer(const std::string& line) +table_tokenizer opencog::get_row_tokenizer(const std::string& line) { typedef boost::escaped_list_separator separator; typedef boost::tokenizer tokenizer; @@ -508,6 +508,38 @@ istreamDenseTable(const Handle& anchor, const std::vector& col_types, bool has_header) { + // Width of table in the input. + size_t table_width = col_types.size(); + + // Effective width is the width, without the ignored columns. + size_t effective_width = table_width - ignore_idxs.size(); + + // Setup a mask; should we skip the column? + std::vector skip_col(table_width, false); + for (unsigned i : ignore_idxs) + skip_col[i] = true; + + // Set up typed columns. + std::vector> bool_cols; + std::vector> float_cols; + std::vector> string_cols; + + for (size_t ic = 0; ic < table_width; ic++) + { + if (skip_col[ic]) continue; + if (BOOL_VALUE == col_types[ic]) + bool_cols.push_back(std::vector()); + else + if (FLOAT_VALUE == col_types[ic]) + float_cols.push_back(std::vector()); + else + if (STRING_VALUE == col_types[ic]) + string_cols.push_back(std::vector()); + else + throw RuntimeException(TRACE_INFO, + "Unhandled column type"); + } + std::string line; // Assume the stream is at the begining. 
@@ -519,6 +551,7 @@ istreamDenseTable(const Handle& anchor, while (get_data_line(in, line)) { table_tokenizer toker = get_row_tokenizer(line); +#if 0 size_t i = 0; for (const std::string& tok : toker) { if (!boost::binary_search(ignored_indices, i)) { @@ -530,6 +563,7 @@ istreamDenseTable(const Handle& anchor, } i++; } +#endif } #if 0 From 635a4796d2d3dc34314273d0f84e44afe33714ea Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 21:27:49 +0300 Subject: [PATCH 25/56] Read boolean columns in the table --- opencog/persist/csv/table_read.cc | 105 ++++++++++++++---------------- 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index c712ee8adc..b5bf7979ce 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -230,53 +230,6 @@ infer_type_from_token2(Type curr_guess, const std::string& token) return STRING_VALUE; } -/// cast string "token" to a vertex of type "tipe" -static ValuePtr token_to_boolean(const std::string& token) -{ - if ("0" == token || "F" == token || "f" == token) - return createBoolValue(false); - - if ("1" == token || "T" == token || "t" == token) - return createBoolValue(true); - - throw SyntaxException(TRACE_INFO, - "Expecting boolean value, got %s", token.c_str()); -} - -static ValuePtr token_to_contin(const std::string& token) -{ - try { - return createFloatValue(lexical_cast(token)); - } catch (boost::bad_lexical_cast&) { - throw SyntaxException(TRACE_INFO, - "Could not cast %s to floating point", token.c_str()); - } -} - -ValuePtr opencog::token_to_vertex(Type tipe, const std::string& token) -{ - if (BOOL_VALUE == tipe) - return token_to_boolean(token); - - if (FLOAT_VALUE == tipe) - return token_to_contin(token); - - if (STRING_VALUE == tipe) - { - // Enum types must begin with an alpha character - if (isalpha(token[0])) - return createStringValue(token); - - throw SyntaxException(TRACE_INFO, - "Enum type must 
begin with alphabetic char, but %s doesn't", - token.c_str()); - } - - throw SyntaxException(TRACE_INFO, - "Unable to convert token \"%s\" to type=%d", - token.c_str(), tipe); -} - // =========================================================== // istream regular tables. static const char *sparse_delim = " : "; @@ -501,6 +454,30 @@ inferTableAttributes(std::istream& in, // ================================================================== +/// cast string "token" to a vertex of type "tipe" +static bool token_to_bool(const std::string& token) +{ + if ("0" == token || "F" == token || "f" == token) + return false; + + if ("1" == token || "T" == token || "t" == token) + return true; + + throw SyntaxException(TRACE_INFO, + "Expecting boolean value, got %s", token.c_str()); +} + +static double token_to_contin(const std::string& token) +{ + try { + return boost::lexical_cast(token); + } catch (boost::bad_lexical_cast&) { + throw SyntaxException(TRACE_INFO, + "Could not cast %s to floating point", token.c_str()); + } +} + + static std::istream& istreamDenseTable(const Handle& anchor, std::istream& in, @@ -551,19 +528,33 @@ istreamDenseTable(const Handle& anchor, while (get_data_line(in, line)) { table_tokenizer toker = get_row_tokenizer(line); + size_t ic = 0; + size_t bc = 0; + size_t fc = 0; + size_t sc = 0; + for (const std::string& tok : toker) + { + if (skip_col[ic]) { ic++; continue; } + if (BOOL_VALUE == col_types[ic]) + { + bool_cols[bc].push_back(token_to_bool(tok)); + bc ++; + ic ++; + continue; + } #if 0 - size_t i = 0; - for (const std::string& tok : toker) { - if (!boost::binary_search(ignored_indices, i)) { + else + if (FLOAT_VALUE == col_types[ic]) + float_cols.push_back(std::vector()); + else + if (STRING_VALUE == col_types[ic]) + string_cols.push_back(std::vector()); + +xxx T el = boost::lexical_cast(tok); - if (target_idx == i) - res.second = el; - else - res.first.push_back(el); - } - i++; - } + res.first.push_back(el); #endif + } } #if 0 From 
02bb9bd5becd2e2fdec292aa83c827b2381e8add Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 21:30:33 +0300 Subject: [PATCH 26/56] Handle the remaining column types --- opencog/persist/csv/table_read.cc | 57 +++++++++---------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index b5bf7979ce..3b5d785fad 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -542,52 +542,29 @@ istreamDenseTable(const Handle& anchor, ic ++; continue; } -#if 0 - else + if (FLOAT_VALUE == col_types[ic]) - float_cols.push_back(std::vector()); - else + { + float_cols[fc].push_back(token_to_contin(tok)); + fc ++; + ic ++; + continue; + } + if (STRING_VALUE == col_types[ic]) - string_cols.push_back(std::vector()); + { + string_cols[sc].push_back(tok); + sc ++; + ic ++; + continue; + } -xxx - T el = boost::lexical_cast(tok); - res.first.push_back(el); -#endif + throw RuntimeException(TRACE_INFO, + "Unhandled column type"); } } -#if 0 - // Function to parse each line (to be called in parallel) - auto parse_line = [&](unsigned i) { - try { - // Fill input - auto tokenIOT = tokenizeRowIOT(lines[i], ignore_idxs, - target_idx, timestamp_idx); - } - catch (AssertionException& ex) { - unsigned lineno = has_header? 
i+1 : i; - OC_ASSERT(false, "Parsing error occurred on line %d of input file\n" - "Exception: %s", lineno, ex.what()); - } - }; - - // Call it for each line in parallel - auto ir = boost::irange((size_t)0, lines.size()); - vector row_idxs(ir.begin(), ir.end()); - OMP_ALGO::for_each(row_idxs.begin(), row_idxs.end(), parse_line); - - // Assign the target position relative to the ignored indices - // (useful for writing that file back) - tab.target_pos = target_idx - boost::count_if(ignore_idxs, - arg1 < target_idx); - - if (timestamp_idx >= 0) - tab.timestamp_pos = timestamp_idx - - boost::count_if(ignore_idxs, arg1 < timestamp_idx); -#endif - - return in; + return in; } // ================================================================== From 9c16a7530499b88552f489f66d7fd5dade0d2130 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 21:47:36 +0300 Subject: [PATCH 27/56] Stub out or remove dead code --- opencog/persist/csv/table_read.cc | 26 ++++++----------- opencog/persist/csv/table_read.h | 47 +++++++++++-------------------- 2 files changed, 25 insertions(+), 48 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 3b5d785fad..07d5f66179 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -26,16 +26,10 @@ #include #include -#include -#include +#include #include #include #include -#include -#include - -#include -#include #include #include @@ -53,10 +47,6 @@ using namespace opencog; -using namespace boost; -using namespace boost::phoenix; -using boost::phoenix::arg_names::arg1; - // ------------------------------------------------------- /** @@ -231,6 +221,8 @@ infer_type_from_token2(Type curr_guess, const std::string& token) } // =========================================================== +#ifdef NOT_USED_ANYWHERE + // istream regular tables. 
static const char *sparse_delim = " : "; @@ -275,14 +267,14 @@ std::istream& istreamRawITable(std::istream& in, ITable& tab, lines.push_back(line); // Determine the arity from the first line. - std::vector fl = tokenizeRow(lines[0], ignored_indices); + std::vector fl = tokenizeRow(lines[0]); size_t arity = fl.size(); std::atomic arity_fail_row(-1); auto parse_line = [&](size_t i) { // tokenize the line and fill the table with - tab[i] = tokenizeRow(lines[i], ignored_indices); + tab[i] = tokenizeRow(lines[i]); // Check arity if (arity != tab[i].size()) @@ -306,6 +298,7 @@ std::istream& istreamRawITable(std::istream& in, ITable& tab, } return in; } +#endif // NOT_USED_ANYWHERE // =========================================================== @@ -357,7 +350,7 @@ std::vector get_header(const std::string& file_name) std::ifstream in(file_name.c_str()); std::string line; get_data_line(in, line); - return tokenizeRow(line); + return tokenizeRow(line); } // ================================================================== @@ -402,8 +395,7 @@ inferTableAttributes(std::istream& in, lines.push_back(line); // Parse what could be a header - const std::vector maybe_header = - tokenizeRow(lines.front()); + const std::vector maybe_header = tokenizeRow(lines.front()); // Determine arity size_t arity = maybe_header.size(); @@ -417,7 +409,7 @@ inferTableAttributes(std::istream& in, for (size_t i = 1; i < lines.size(); ++i) { // Parse line - const string_seq& tokens = tokenizeRow(lines[i]); + const string_seq& tokens = tokenizeRow(lines[i]); // Check arity if (arity != tokens.size()) diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index d03425b8a8..7732ca3826 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -32,22 +32,12 @@ #include #include -#include -#include -#include #include #include namespace opencog { -/** - * Convert strings to typed values - */ -ValuePtr token_to_vertex(Type, const std::string&); - -// 
=========================================================== - typedef boost::tokenizer> table_tokenizer; /** @@ -59,28 +49,23 @@ table_tokenizer get_row_tokenizer(const std::string& line); /** * Take a line and return a vector containing the elements parsed. */ -template -static std::vector tokenizeRow ( - const std::string& line, - const std::vector& ignored_indices=std::vector()) +static std::vector tokenizeRow (const std::string& line) { - table_tokenizer tok = get_row_tokenizer(line); - std::vector res; - unsigned i = 0; - for (const std::string& t : tok) { - - // trim away whitespace padding; failing to do this - // confuses stuff downstream. - std::string clean(t); - boost::trim(clean); - - // Sometimes the tokenizer returns pure whitespace :-( - if (0 == clean.size()) continue; - - if (!boost::binary_search(ignored_indices, i++)) - res.push_back(boost::lexical_cast(clean)); - } - return res; + table_tokenizer tok = get_row_tokenizer(line); + std::vector res; + for (const std::string& t : tok) + { + // Trim away whitespace padding; failing to do this + // confuses stuff downstream. 
+ std::string clean(t); + boost::trim(clean); + + // Sometimes the tokenizer returns pure whitespace :-( + if (0 == clean.size()) continue; + + res.push_back(clean); + } + return res; } // =========================================================== From 601a22685753fccee562006964198ca2d69b3cb4 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 21:52:11 +0300 Subject: [PATCH 28/56] More cleanup --- opencog/persist/csv/table_read.cc | 12 ++---------- opencog/persist/csv/table_read.h | 12 +++--------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 07d5f66179..434374afb3 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -50,7 +50,7 @@ using namespace opencog; // ------------------------------------------------------- /** - * remove the carriage return (for DOS format) + * Remove the carriage return (for DOS format). */ static void removeCarriageReturn(std::string& str) { @@ -60,7 +60,7 @@ static void removeCarriageReturn(std::string& str) } /** - * remove non ASCII char at the begining of the string + * Remove non ASCII char at the begining of the string. 
*/ static void removeNonASCII(std::string& str) { @@ -345,14 +345,6 @@ is_header(const string_seq& tokens, const std::vector& col_types) return false; } -std::vector get_header(const std::string& file_name) -{ - std::ifstream in(file_name.c_str()); - std::string line; - get_data_line(in, line); - return tokenizeRow(line); -} - // ================================================================== /** diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 7732ca3826..dd5b64bd14 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -80,15 +80,9 @@ typedef std::vector Table; // =========================================================== -// Get the header of a DSV file (assuming there is one) -string_seq get_header(const std::string& input_file); - -std::istream& istreamRawITable( - std::istream& in, ITable& table, - const std::vector& ignored_indices=std::vector()); - -std::istream& istreamITable(std::istream& in, ITable& tab, - const string_seq& ignore_features); +//std::istream& istreamRawITable( +// std::istream& in, ITable& table, +// const std::vector& ignored_indices=std::vector()); std::istream& istreamTable(const Handle&, std::istream&, From aab0dd5c6e3215267fc59cdac026eb92a48bc2dc Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 22:14:38 +0300 Subject: [PATCH 29/56] Start passing column names in --- opencog/persist/csv/table_read.cc | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 434374afb3..ff0ded4c2a 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -370,6 +370,7 @@ inferTableAttributes(std::istream& in, const std::vector& ignore_features, std::vector& ignore_idxs, std::vector& tt, + std::vector& maybe_header, bool& has_header) { has_header = false; @@ -383,11 +384,11 @@ inferTableAttributes(std::istream& in, // Get 
a portion of the dataset into memory (cleaning weird stuff) std::vector lines; std::string line; - while (get_data_line(in, line) && maxline-- > 0) + while (get_data_line(in, line) and 0 < maxline--) lines.push_back(line); // Parse what could be a header - const std::vector maybe_header = tokenizeRow(lines.front()); + maybe_header = tokenizeRow(lines.front()); // Determine arity size_t arity = maybe_header.size(); @@ -467,13 +468,14 @@ istreamDenseTable(const Handle& anchor, std::istream& in, const std::vector& ignore_idxs, const std::vector& col_types, + const std::vector& header, bool has_header) { // Width of table in the input. size_t table_width = col_types.size(); // Effective width is the width, without the ignored columns. - size_t effective_width = table_width - ignore_idxs.size(); + // size_t effective_width = table_width - ignore_idxs.size(); // Setup a mask; should we skip the column? std::vector skip_col(table_width, false); @@ -571,14 +573,25 @@ opencog::istreamTable(const Handle& anchor, std::streampos beg = in.tellg(); // Infer the properties of the table without loading its content - bool has_header = false; std::vector ignore_indexes; std::vector col_types; + std::vector header; + bool has_header = false; inferTableAttributes(in, ignore_features, ignore_indexes, - col_types, has_header); + col_types, header, has_header); + + // If the header is missing, then fake it. 
+ if (not has_header) + { + header.clear(); + for (size_t i=0; i Date: Sat, 20 Aug 2022 22:28:24 +0300 Subject: [PATCH 30/56] Start placing the values on the anchor --- opencog/persist/csv/table_read.cc | 32 ++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index ff0ded4c2a..d7af4015cf 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -37,7 +37,7 @@ #include #include -#include +#include #include #include #include @@ -550,6 +550,36 @@ istreamDenseTable(const Handle& anchor, } } + // Now that we've read everything in, + // place the individual columns into the anchor atom. + AtomSpace* as = anchor->getAtomSpace(); + size_t bc = 0; + size_t fc = 0; + size_t sc = 0; + for (size_t ic = 0; ic < table_width; ic++) + { + if (skip_col[ic]) { ic++; continue; } + if (BOOL_VALUE == col_types[ic]) + { + Handle key = as->add_node(PREDICATE_NODE, std::string(header[ic])); + ValuePtr bvp = createBoolValue(bool_cols[bc]); + as->set_value(anchor, key, bvp); + bc ++; + ic ++; + } +#if 0 + else + if (FLOAT_VALUE == col_types[ic]) + float_cols.push_back(std::vector()); + else + if (STRING_VALUE == col_types[ic]) + string_cols.push_back(std::vector()); + else +#endif + throw RuntimeException(TRACE_INFO, + "Unhandled column type"); + } + return in; } From d867b1e8f86d5791edae3b058bef0ffaa8dde486 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 22:35:56 +0300 Subject: [PATCH 31/56] Handle the other kinds of columns --- opencog/persist/csv/table_read.cc | 33 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index d7af4015cf..27d32230ec 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -559,25 +559,24 @@ istreamDenseTable(const Handle& anchor, for (size_t ic = 0; ic < 
table_width; ic++) { if (skip_col[ic]) { ic++; continue; } + + ValuePtr vp; if (BOOL_VALUE == col_types[ic]) - { - Handle key = as->add_node(PREDICATE_NODE, std::string(header[ic])); - ValuePtr bvp = createBoolValue(bool_cols[bc]); - as->set_value(anchor, key, bvp); - bc ++; - ic ++; - } -#if 0 - else - if (FLOAT_VALUE == col_types[ic]) - float_cols.push_back(std::vector()); - else - if (STRING_VALUE == col_types[ic]) - string_cols.push_back(std::vector()); + vp = createBoolValue(bool_cols[bc++]); + + else if (FLOAT_VALUE == col_types[ic]) + vp = createFloatValue(float_cols[fc++]); + + else if (STRING_VALUE == col_types[ic]) + vp = createStringValue(string_cols[sc++]); + else -#endif - throw RuntimeException(TRACE_INFO, - "Unhandled column type"); + throw RuntimeException(TRACE_INFO, + "Unhandled column type"); + + Handle key = as->add_node(PREDICATE_NODE, std::string(header[ic])); + as->set_value(anchor, key, vp); + ic ++; } return in; From 5bc0176287aecf9139ff937b9fa3dfe30db74e21 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 22:41:23 +0300 Subject: [PATCH 32/56] Add the list of keys to a well-known location --- opencog/persist/csv/table_read.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 27d32230ec..a9c82fb557 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -556,6 +557,7 @@ istreamDenseTable(const Handle& anchor, size_t bc = 0; size_t fc = 0; size_t sc = 0; + HandleSeq keylist; for (size_t ic = 0; ic < table_width; ic++) { if (skip_col[ic]) { ic++; continue; } @@ -576,9 +578,16 @@ istreamDenseTable(const Handle& anchor, Handle key = as->add_node(PREDICATE_NODE, std::string(header[ic])); as->set_value(anchor, key, vp); + keylist.push_back(key); ic ++; } + // And finally, place a list of all the keys in a well-known + // location. 
+ Handle klp = as->add_node(PREDICATE_NODE, std::string("*-column-keys-*")); + ValuePtr kvp = createLinkValue(keylist); + as->set_value(anchor, klp, kvp); + return in; } From 20edb093cf1f0d93242fb96cc9167b1f0bd9f2f6 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 22:43:33 +0300 Subject: [PATCH 33/56] More header cleanup --- opencog/persist/csv/table_read.cc | 2 ++ opencog/persist/csv/table_read.h | 5 ----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index a9c82fb557..59bd87468e 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -227,6 +227,8 @@ infer_type_from_token2(Type curr_guess, const std::string& token) // istream regular tables. static const char *sparse_delim = " : "; +typedef std::vector ITable; + /** * Fill the input table, given a file in DSV (delimiter-seperated values) * format. The delimiters are ',', ' ' or '\t'. diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index dd5b64bd14..eaf6e208d7 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -73,11 +73,6 @@ static std::vector tokenizeRow (const std::string& line) // TODO: Should this be a StringValue? typedef std::vector string_seq; -typedef std::vector ITable; - -// TODO Should this be a TableValue? 
-typedef std::vector Table; - // =========================================================== //std::istream& istreamRawITable( From 36a4ccbbbcba4692dfa46b1b3b14360b3a8696e9 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 22:53:01 +0300 Subject: [PATCH 34/56] Move stuff from header to c file --- opencog/persist/csv/table_read.cc | 37 ++++++++++++++----------------- opencog/persist/csv/table_read.h | 32 -------------------------- 2 files changed, 17 insertions(+), 52 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 59bd87468e..502a5827eb 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -112,11 +112,13 @@ std::istream& get_data_line(std::istream& is, std::string& line) // ------------------------------------------------------- +typedef boost::tokenizer> table_tokenizer; + /** * Take a row, return a tokenizer. Tokenization uses the * separator characters comma, blank, tab (',', ' ' or '\t'). */ -table_tokenizer opencog::get_row_tokenizer(const std::string& line) +static table_tokenizer get_row_tokenizer(const std::string& line) { typedef boost::escaped_list_separator separator; typedef boost::tokenizer tokenizer; @@ -126,29 +128,24 @@ table_tokenizer opencog::get_row_tokenizer(const std::string& line) return tokenizer(line, sep); } -// Same as above, but only allow commas as a column separator. -table_tokenizer get_sparse_row_tokenizer(const std::string& line) -{ - typedef boost::escaped_list_separator separator; - typedef boost::tokenizer tokenizer; - - // Tokenize line; currently, we allow tabs, commas, blanks. - static const separator sep("\\", ",", "\""); - return tokenizer(line, sep); -} - /** * Take a line and return a vector containing the elements parsed. - * Used by istreamTable. This will modify the line to remove leading - * non-ASCII characters, as well as stripping of any carriage-returns. 
*/ -std::vector tokenizeSparseRow(const std::string& line) +static std::vector tokenizeRow (const std::string& line) { - table_tokenizer tok = get_sparse_row_tokenizer(line); + table_tokenizer tok = get_row_tokenizer(line); std::vector res; - for (std::string t : tok) { - boost::trim(t); - res.push_back(t); + for (const std::string& t : tok) + { + // Trim away whitespace padding; failing to do this + // confuses stuff downstream. + std::string clean(t); + boost::trim(clean); + + // Sometimes the tokenizer returns pure whitespace :-( + if (0 == clean.size()) continue; + + res.push_back(clean); } return res; } @@ -158,7 +155,7 @@ std::vector tokenizeSparseRow(const std::string& line) * Given an input string, guess the type of the string. * Inferable types are: boolean, contin and enum. */ -Type infer_type_from_token(const std::string& token) +static Type infer_type_from_token(const std::string& token) { /* Prefered representation is T's and 0's, to maximize clarity, * readability. Numeric values are easily confused with floating diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index eaf6e208d7..fdc6782204 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -38,38 +38,6 @@ namespace opencog { -typedef boost::tokenizer> table_tokenizer; - -/** - * Take a row, return a tokenizer. Tokenization uses the - * separator characters comma, blank, tab (',', ' ' or '\t'). - */ -table_tokenizer get_row_tokenizer(const std::string& line); - -/** - * Take a line and return a vector containing the elements parsed. - */ -static std::vector tokenizeRow (const std::string& line) -{ - table_tokenizer tok = get_row_tokenizer(line); - std::vector res; - for (const std::string& t : tok) - { - // Trim away whitespace padding; failing to do this - // confuses stuff downstream. 
- std::string clean(t); - boost::trim(clean); - - // Sometimes the tokenizer returns pure whitespace :-( - if (0 == clean.size()) continue; - - res.push_back(clean); - } - return res; -} - -// =========================================================== - // TODO: Should this be a StringValue? typedef std::vector string_seq; From e98033808d4297eeb7b4cf547d8a9ae6a6000edf Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:01:05 +0300 Subject: [PATCH 35/56] Add documentation --- opencog/persist/csv/table_read.cc | 42 ++++++++++++++++++++++++++++--- opencog/persist/csv/table_read.h | 12 +++------ 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 502a5827eb..a464e4727b 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -633,9 +633,45 @@ opencog::istreamTable(const Handle& anchor, // ================================================================== -void loadTable(const Handle& anchor, - const std::string& file_name, - const string_seq& ignore_features) +/** + * Load columns from a CSV file and place them into Atomese Values on + * the indicated Atom. Atomese Values are vectors (of floats, bools, + * srings, or more complex structures). Each Value holds one column + * from the dataset. + * + * The features (columns) specified in ignore_features will be omitted + * from the representation. + * + * For example, a CSV dataset like this: + * o, i1, i2, i3, i4 + * 1, 0, 0, 3.3, "foo" + * 0, 1, 0, 4.4, "bar" + * + * will be loaded as key-value pairs on the `anchor` Atom. 
+ * + * First, at the "well known location" + * (Predicate "*-column-keys-*") + * there will be a list of all of the column-keys in the table: + * (LinkValue + * (Predicate "o") + * (Predicate "i1") + * (Predicate "i2") + * (Predicate "i3") + * (Predicate "i4")) + * + * Next, under each key, there will a column of values: + * (Predicate "o") (BoolValue 1 0) + * (Predicate "i1") (BoolValue 0 1) + * (Predicate "i2") (BoolValue 0 0) + * (Predicate "i3") (FloatValue 3.3 4.4) + * (Predicate "i4") (StringValue "foo" "bar") + * + * @param file_name + * @param ignore_features + */ +void load_cvs_table(const Handle& anchor, + const std::string& file_name, + const string_seq& ignore_features) { if (file_name.empty()) throw RuntimeException(TRACE_INFO, "The file name is empty!"); diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index fdc6782204..d83241a2b3 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -31,9 +31,6 @@ #include #include -#include -#include - #include namespace opencog { @@ -41,20 +38,19 @@ namespace opencog { // TODO: Should this be a StringValue? typedef std::vector string_seq; -// =========================================================== +void load_csv_table(const Handle& anchor, + const std::string& file_name, + const string_seq& ignore_features=string_seq()); //std::istream& istreamRawITable( // std::istream& in, ITable& table, // const std::vector& ignored_indices=std::vector()); +// Same as above, but works for an already-open stream. 
std::istream& istreamTable(const Handle&, std::istream&, const string_seq& ignore_features); -void loadTable(const Handle& anchor, - const std::string& file_name, - const string_seq& ignore_features=string_seq()); - } // ~namespaces opencog #endif // _ATOMESE_TABLE_READ_H From 44869c1050e64ec9ad67f24acb4c6825dd83eb97 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:02:43 +0300 Subject: [PATCH 36/56] Remove un-needed files --- opencog/persist/csv/CMakeLists.txt | 3 +- opencog/persist/csv/load_csv.cc | 60 ------------------------------ opencog/persist/csv/load_csv.h | 41 -------------------- 3 files changed, 1 insertion(+), 103 deletions(-) delete mode 100644 opencog/persist/csv/load_csv.cc delete mode 100644 opencog/persist/csv/load_csv.h diff --git a/opencog/persist/csv/CMakeLists.txt b/opencog/persist/csv/CMakeLists.txt index d358336b58..739333f0ff 100644 --- a/opencog/persist/csv/CMakeLists.txt +++ b/opencog/persist/csv/CMakeLists.txt @@ -1,7 +1,6 @@ # Generic JSON decoding. ADD_LIBRARY (csv - load_csv.cc table_read.cc ) @@ -18,7 +17,7 @@ INSTALL (TARGETS csv EXPORT AtomSpaceTargets ) INSTALL (FILES - load_csv.h + table_read.h DESTINATION "include/opencog/persist/csv" ) diff --git a/opencog/persist/csv/load_csv.cc b/opencog/persist/csv/load_csv.cc deleted file mode 100644 index 6c55f5f000..0000000000 --- a/opencog/persist/csv/load_csv.cc +++ /dev/null @@ -1,60 +0,0 @@ -/** - * load_csv.cc -- Load CSV tables into Values - * - * Copyright (C) 2022 Linas Vepstas - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License v3 as - * published by the Free Software Foundation and including the exceptions - * at http://opencog.org/wiki/Licenses - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program; if not, write to: - * Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include - -#include -#include "load_csv.h" - -using namespace opencog; - -/** - * Load columns from a CSV file and place them into Atomese Values on - * the indicated Atom. Atomese Values are vectors (of floats, bools, - * srings, or more complex structures). Each Value holds one column - * from the dataset. - * - * The features (columns) specified in ignore_features will be omitted - * from the representation. - * - * For example, a CSV dataset like this: - * o, i1, i2, i3, i4 - * 1, 0, 0, 3.3, "foo" - * 0, 1, 0, 4.4, "bar" - * - * will be loaded as the following key-value pairs on the `anchor` Atom: - * (Predicate "*-column-names-*") (StringValue "o", "i1", "i2", "i3", "i4") - * (Predicate "o") (BoolValue 1 0) - * (Predicate "i1") (BoolValue 0 1) - * (Predicate "i2") (BoolValue 0 0) - * (Predicate "i3") (FloatValue 3.3 4.4) - * (Predicate "i4") (StringValue "foo" "bar") - * - * @param file_name - * @param ignore_features - * @return - */ -void load_csv_table( - const Handle& anchor, - const std::string& file_name, - const std::vector& ignore_features) -{ -} diff --git a/opencog/persist/csv/load_csv.h b/opencog/persist/csv/load_csv.h deleted file mode 100644 index f073d6336b..0000000000 --- a/opencog/persist/csv/load_csv.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * load_csv.h -- Load CSV tables into Values - * - * Copyright (C) 2018 OpenCog Foundation - * Copyright (C) 2022 Linas Vepstas - * - * Author: Yidnekachew Wondimu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License v3 as - * published by the Free Software Foundation and including the exceptions - * at 
http://opencog.org/wiki/Licenses - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program; if not, write to: - * Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#ifndef _ATOMESE_LOAD_CSV_H -#define _ATOMESE_LOAD_CSV_H - -#include - -namespace opencog { - -// Load columns from a CSV file and place them into Atomese Values on -// the indicated Atom. See the .cc file for additional info. -void load_csv_table( - const Handle& anchor, - const std::string& file_name, - const std::vector& ignore_features=std::vector()); - -} // end namespace opencog - -#endif //_ATOMESE_LOAD_CSV_H From 829d9340197a24ab3e8acd6aebe83f4b9f02707e Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:10:23 +0300 Subject: [PATCH 37/56] Move documentation around --- opencog/persist/csv/table_read.cc | 52 ++++++++----------------------- opencog/persist/csv/table_read.h | 36 +++++++++++++++++++++ 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index a464e4727b..4778abba58 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -462,7 +462,9 @@ static double token_to_contin(const std::string& token) } } - +// See header file for `load_csv_table` for a general description +// of what is being done here. In breif, columns from a table +// are jammed into individual values on a given atom. static std::istream& istreamDenseTable(const Handle& anchor, std::istream& in, @@ -482,7 +484,7 @@ istreamDenseTable(const Handle& anchor, for (unsigned i : ignore_idxs) skip_col[i] = true; - // Set up typed columns. 
+ // Set up typed columns. They're empty at first. std::vector> bool_cols; std::vector> float_cols; std::vector> string_cols; @@ -503,6 +505,7 @@ istreamDenseTable(const Handle& anchor, "Unhandled column type"); } + // ---------------------------------------------- std::string line; // Assume the stream is at the begining. @@ -511,6 +514,8 @@ istreamDenseTable(const Handle& anchor, get_data_line(in, line); // Loop over all lines in the table, one by one. + // Stuff the desired columns into each of the columns + // we created above. while (get_data_line(in, line)) { table_tokenizer toker = get_row_tokenizer(line); @@ -551,7 +556,11 @@ istreamDenseTable(const Handle& anchor, } // Now that we've read everything in, - // place the individual columns into the anchor atom. + // place the individual columns into Values, + // and then each value under's its column name, + // all of these on the anchor atom. + + // XXX TODO, we should probably take AtomSpace as an argument!? AtomSpace* as = anchor->getAtomSpace(); size_t bc = 0; size_t fc = 0; @@ -633,42 +642,7 @@ opencog::istreamTable(const Handle& anchor, // ================================================================== -/** - * Load columns from a CSV file and place them into Atomese Values on - * the indicated Atom. Atomese Values are vectors (of floats, bools, - * srings, or more complex structures). Each Value holds one column - * from the dataset. - * - * The features (columns) specified in ignore_features will be omitted - * from the representation. - * - * For example, a CSV dataset like this: - * o, i1, i2, i3, i4 - * 1, 0, 0, 3.3, "foo" - * 0, 1, 0, 4.4, "bar" - * - * will be loaded as key-value pairs on the `anchor` Atom. 
- * - * First, at the "well known location" - * (Predicate "*-column-keys-*") - * there will be a list of all of the column-keys in the table: - * (LinkValue - * (Predicate "o") - * (Predicate "i1") - * (Predicate "i2") - * (Predicate "i3") - * (Predicate "i4")) - * - * Next, under each key, there will a column of values: - * (Predicate "o") (BoolValue 1 0) - * (Predicate "i1") (BoolValue 0 1) - * (Predicate "i2") (BoolValue 0 0) - * (Predicate "i3") (FloatValue 3.3 4.4) - * (Predicate "i4") (StringValue "foo" "bar") - * - * @param file_name - * @param ignore_features - */ +// See header file for general description. void load_cvs_table(const Handle& anchor, const std::string& file_name, const string_seq& ignore_features) diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index d83241a2b3..c16d4fc2cb 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -38,6 +38,42 @@ namespace opencog { // TODO: Should this be a StringValue? typedef std::vector string_seq; +/** + * Load columns from a CSV file and place them into Atomese Values on + * the indicated Atom. Atomese Values are vectors (of floats, bools, + * srings, or more complex structures). Each Value holds one column + * from the dataset. + * + * The features (columns) specified in ignore_features will be omitted + * from the representation. + * + * For example, a CSV dataset like this: + * o, i1, i2, i3, i4 + * 1, 0, 0, 3.3, "foo" + * 0, 1, 0, 4.4, "bar" + * + * will be loaded as key-value pairs on the `anchor` Atom. 
+ * + * First, at the "well known location" + * (Predicate "*-column-keys-*") + * there will be a list of all of the column-keys in the table: + * (LinkValue + * (Predicate "o") + * (Predicate "i1") + * (Predicate "i2") + * (Predicate "i3") + * (Predicate "i4")) + * + * Next, under each key, there will a column of values: + * (Predicate "o") (BoolValue 1 0) + * (Predicate "i1") (BoolValue 0 1) + * (Predicate "i2") (BoolValue 0 0) + * (Predicate "i3") (FloatValue 3.3 4.4) + * (Predicate "i4") (StringValue "foo" "bar") + * + * @param file_name + * @param ignore_features + */ void load_csv_table(const Handle& anchor, const std::string& file_name, const string_seq& ignore_features=string_seq()); From c15a912a76764eff4f44688bad57270d838e63e7 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:26:31 +0300 Subject: [PATCH 38/56] Add a README to explain what is going on --- opencog/persist/README.md | 3 +- opencog/persist/csv/README.md | 50 ++++++++++++++++++++++++++++++++ opencog/persist/csv/table_read.h | 2 +- 3 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 opencog/persist/csv/README.md diff --git a/opencog/persist/README.md b/opencog/persist/README.md index 81de0b5ef6..1e51b39928 100644 --- a/opencog/persist/README.md +++ b/opencog/persist/README.md @@ -17,7 +17,8 @@ Local subdirectories include: for RocksDB and one that allows AtomSpaces to trade Atoms over the network.) -* csv -- Load Values from CSV/TSV files. Each column in the CSV +* csv -- Load Values from CSV/TSV files. These are "delimiter + separated values" -- ordinary tables. Each column in the table is loaded into an appropriate Value (`FloatValue`, `BoolValue` or `StringValue`). The values are placed under keys (named after the column) on the provided Atom. 
diff --git a/opencog/persist/csv/README.md b/opencog/persist/csv/README.md
new file mode 100644
index 0000000000..87a4f06ed7
--- /dev/null
+++ b/opencog/persist/csv/README.md
@@ -0,0 +1,50 @@
+
+Load Ordinary CSV Tables
+========================
+The code here is able to load "delimiter-separated values" (DSV,
+or CSV, TSV for comma and tab separators) from a file. These are
+just very conventional tables.
+
+Each column from a DSV file is read in and placed into an Atomese
+Values on an indicated Atom. Atomese Values are vectors (of floats,
+bools, strings). Each Value holds one column from the dataset.
+
+Basically, this just gets CSV data into the AtomSpace, where it
+becomes easy for Atomese programs to act on them, i.e. to use them
+as input for some kind of data stream processing.
+
+The features (columns) specified in ignore_features will be omitted
+from the representation.
+
+Example
+-------
+For example, a CSV dataset like this:
+```
+  o, i1, i2, i3, i4
+  1, 0, 0, 3.3, "foo"
+  0, 1, 0, 4.4, "bar"
+```
+will be loaded as key-value pairs on the `anchor` Atom.
+
+The column names will be loaded under a "well known key":
+```
+  (Predicate "*-column-keys-*")
+```
+This key will point at a value holding a list of all of the
+column-keys in the table:
+```
+  (LinkValue
+    (Predicate "o")
+    (Predicate "i1")
+    (Predicate "i2")
+    (Predicate "i3")
+    (Predicate "i4"))
+```
+Then, under each key, there will be a column of values:
+```
+  (Predicate "o") (BoolValue 1 0)
+  (Predicate "i1") (BoolValue 0 1)
+  (Predicate "i2") (BoolValue 0 0)
+  (Predicate "i3") (FloatValue 3.3 4.4)
+  (Predicate "i4") (StringValue "foo" "bar")
+```
diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h
index c16d4fc2cb..4df8031530 100644
--- a/opencog/persist/csv/table_read.h
+++ b/opencog/persist/csv/table_read.h
@@ -41,7 +41,7 @@ typedef std::vector string_seq;
 /**
  * Load columns from a CSV file and place them into Atomese Values on
  * the indicated Atom. 
Atomese Values are vectors (of floats, bools, - * srings, or more complex structures). Each Value holds one column + * strings, or more complex structures). Each Value holds one column * from the dataset. * * The features (columns) specified in ignore_features will be omitted From 7c8256c7b13a68fae9f8b355dca4ff911cbeb2f0 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:46:24 +0300 Subject: [PATCH 39/56] Start work on a unit test for CSV --- opencog/persist/csv/table_read.cc | 2 +- tests/persist/CMakeLists.txt | 1 + tests/persist/csv/CMakeLists.txt | 3 ++ tests/persist/csv/CSVLoadUTest.cxxtest | 65 ++++++++++++++++++++++++++ tests/persist/csv/simple.csv | 10 ++++ 5 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 tests/persist/csv/CMakeLists.txt create mode 100644 tests/persist/csv/CSVLoadUTest.cxxtest create mode 100644 tests/persist/csv/simple.csv diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 4778abba58..6f200f5485 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -592,7 +592,7 @@ istreamDenseTable(const Handle& anchor, // And finally, place a list of all the keys in a well-known // location. 
- Handle klp = as->add_node(PREDICATE_NODE, std::string("*-column-keys-*")); + Handle klp = as->add_node(PREDICATE_NODE, "*-column-keys-*"); ValuePtr kvp = createLinkValue(keylist); as->set_value(anchor, klp, kvp); diff --git a/tests/persist/CMakeLists.txt b/tests/persist/CMakeLists.txt index f97cfcbe3f..eff8ffb3fb 100644 --- a/tests/persist/CMakeLists.txt +++ b/tests/persist/CMakeLists.txt @@ -1,3 +1,4 @@ +ADD_SUBDIRECTORY (csv) ADD_SUBDIRECTORY (sexpr) ADD_SUBDIRECTORY (sql) ADD_SUBDIRECTORY (tlb) diff --git a/tests/persist/csv/CMakeLists.txt b/tests/persist/csv/CMakeLists.txt new file mode 100644 index 0000000000..ae0ac79321 --- /dev/null +++ b/tests/persist/csv/CMakeLists.txt @@ -0,0 +1,3 @@ +LINK_LIBRARIES(atomspace csv) + +ADD_CXXTEST(CSVLoadUTest) diff --git a/tests/persist/csv/CSVLoadUTest.cxxtest b/tests/persist/csv/CSVLoadUTest.cxxtest new file mode 100644 index 0000000000..e4981364ae --- /dev/null +++ b/tests/persist/csv/CSVLoadUTest.cxxtest @@ -0,0 +1,65 @@ +/* + * CSVLoadUTest.cxxtest + * + * Copyright (c) 2022 Linas Vepstas + * SPDX-License-Identifier: AGPL-3.0-or-later + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + + +#include +#include +#include + +using namespace opencog; + +class CSVLoadUTest : public CxxTest::TestSuite { + +private: + AtomSpacePtr _asp; + +public: + CSVLoadUTest() { + logger().set_print_to_stdout_flag(true); + _asp = createAtomSpace(); + } + + void setUp() { _asp->clear(); } + + void tearDown() {} + + void test_simple_load(); +}; + +// Test load_csv_table +void CSVLoadUTest::test_simple_load() +{ + logger().info("BEGIN TEST: %s", __FUNCTION__); + + Handle h = _asp->add_node(CONCEPT_NODE, "foo"); + + load_csv_table(h, "simple.csv"); + + // There's the five columns, plus the table of contents. + HandleSet keys = h->getKeys(); + TS_ASSERT_EQUALS(6, keys.size()); + + Handle colkey = _asp->add_node(PREDICATE_NODE, "*-column-keys-*"); + ValuePtr kvp = h->getValue(colkey); + + logger().info("END TEST: %s", __FUNCTION__); +} diff --git a/tests/persist/csv/simple.csv b/tests/persist/csv/simple.csv new file mode 100644 index 0000000000..da3ab5c488 --- /dev/null +++ b/tests/persist/csv/simple.csv @@ -0,0 +1,10 @@ +# +# This is a simple demo CSV file +# It contains some comments, a column header +# and some data. +# +o, i1, i2, i3, i4 + +# Above was the column headers. Now the data. + 1, 0, 0, 3.3, "foo" + 0, 1, 0, 4.4, "bar" From 8171c146996abaec5f7a1d288eb8b5e57f7529d8 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sat, 20 Aug 2022 23:56:00 +0300 Subject: [PATCH 40/56] Fix typo in the name --- opencog/persist/csv/table_read.cc | 6 +++--- tests/persist/csv/CMakeLists.txt | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 6f200f5485..7046240654 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -643,9 +643,9 @@ opencog::istreamTable(const Handle& anchor, // ================================================================== // See header file for general description. 
-void load_cvs_table(const Handle& anchor, - const std::string& file_name, - const string_seq& ignore_features) +void opencog::load_csv_table(const Handle& anchor, + const std::string& file_name, + const string_seq& ignore_features) { if (file_name.empty()) throw RuntimeException(TRACE_INFO, "The file name is empty!"); diff --git a/tests/persist/csv/CMakeLists.txt b/tests/persist/csv/CMakeLists.txt index ae0ac79321..9c111c7d1d 100644 --- a/tests/persist/csv/CMakeLists.txt +++ b/tests/persist/csv/CMakeLists.txt @@ -1,3 +1,3 @@ -LINK_LIBRARIES(atomspace csv) +LINK_LIBRARIES(csv atomspace) ADD_CXXTEST(CSVLoadUTest) From a9c5b230c90d38af6c3076eb25a46ca5aeb325f3 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 00:17:22 +0300 Subject: [PATCH 41/56] Bug fix, failed to pass types along --- opencog/persist/csv/table_read.cc | 4 ++-- tests/persist/csv/CSVLoadUTest.cxxtest | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 7046240654..8429da0761 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -369,7 +369,7 @@ static std::istream& inferTableAttributes(std::istream& in, const std::vector& ignore_features, std::vector& ignore_idxs, - std::vector& tt, + std::vector& types, std::vector& maybe_header, bool& has_header) { @@ -395,7 +395,7 @@ inferTableAttributes(std::istream& in, std::atomic arity_fail_row(-1); // Determine initial type - std::vector types(arity, VOID_VALUE); + types.resize(arity, VOID_VALUE); // Parse the rest, determine its type and whether the arity is // consistent diff --git a/tests/persist/csv/CSVLoadUTest.cxxtest b/tests/persist/csv/CSVLoadUTest.cxxtest index e4981364ae..480587d264 100644 --- a/tests/persist/csv/CSVLoadUTest.cxxtest +++ b/tests/persist/csv/CSVLoadUTest.cxxtest @@ -52,7 +52,8 @@ void CSVLoadUTest::test_simple_load() Handle h = _asp->add_node(CONCEPT_NODE, "foo"); - load_csv_table(h, 
"simple.csv"); + // Argh. Ugly. Fix. + load_csv_table(h, "../tests/persist/csv/simple.csv"); // There's the five columns, plus the table of contents. HandleSet keys = h->getKeys(); From 0a0f8a6b708e38189f968073b53377b9131930ce Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 00:22:52 +0300 Subject: [PATCH 42/56] nother bug fix --- opencog/persist/csv/table_read.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 8429da0761..6aebcb1dd0 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -518,12 +518,12 @@ istreamDenseTable(const Handle& anchor, // we created above. while (get_data_line(in, line)) { - table_tokenizer toker = get_row_tokenizer(line); size_t ic = 0; size_t bc = 0; size_t fc = 0; size_t sc = 0; - for (const std::string& tok : toker) + std::vector toks = tokenizeRow(line); + for (const std::string& tok : toks) { if (skip_col[ic]) { ic++; continue; } if (BOOL_VALUE == col_types[ic]) From eff2d58f6cb97f966c40c11e50aa3901a98ea681 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 00:30:42 +0300 Subject: [PATCH 43/56] Another bugfix --- opencog/persist/csv/table_read.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc index 6aebcb1dd0..a111049343 100644 --- a/opencog/persist/csv/table_read.cc +++ b/opencog/persist/csv/table_read.cc @@ -568,7 +568,7 @@ istreamDenseTable(const Handle& anchor, HandleSeq keylist; for (size_t ic = 0; ic < table_width; ic++) { - if (skip_col[ic]) { ic++; continue; } + if (skip_col[ic]) continue; ValuePtr vp; if (BOOL_VALUE == col_types[ic]) @@ -587,7 +587,6 @@ istreamDenseTable(const Handle& anchor, Handle key = as->add_node(PREDICATE_NODE, std::string(header[ic])); as->set_value(anchor, key, vp); keylist.push_back(key); - ic ++; } // And finally, place a list of all the keys 
in a well-known

From f9272b8bd857b67295fb8bf5c9b1d4fdd3aab6af Mon Sep 17 00:00:00 2001
From: Linas Vepstas
Date: Sun, 21 Aug 2022 00:41:54 +0300
Subject: [PATCH 44/56] Expand the unit test some more

---
 tests/persist/csv/CSVLoadUTest.cxxtest | 28 ++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/persist/csv/CSVLoadUTest.cxxtest b/tests/persist/csv/CSVLoadUTest.cxxtest
index 480587d264..fa30e2d1b8 100644
--- a/tests/persist/csv/CSVLoadUTest.cxxtest
+++ b/tests/persist/csv/CSVLoadUTest.cxxtest
@@ -61,6 +61,34 @@ void CSVLoadUTest::test_simple_load()
 	Handle colkey = _asp->add_node(PREDICATE_NODE, "*-column-keys-*");
 	ValuePtr kvp = h->getValue(colkey);
+	TS_ASSERT_EQUALS(5, kvp->size());
+
+	// Loop over the columns
+	LinkValuePtr lvp = LinkValueCast(kvp);
+	HandleSeq keylist = lvp->to_handle_seq();
+	for (const Handle& key : keylist)
+	{
+		ValuePtr vp = h->getValue(key);
+		TS_ASSERT_EQUALS(2, vp->size());
+		printf("Column %s is %s\n", key->to_short_string().c_str(),
+			vp->to_string().c_str());
+	}
+
+	// Loop over columns again, verify types. 
+ int bc = 0; + int fc = 0; + int sc = 0; + for (const Handle& key : keylist) + { + ValuePtr vp = h->getValue(key); + Type vt = vp->get_type(); + if (BOOL_VALUE == vt) bc++; + if (FLOAT_VALUE == vt) fc++; + if (STRING_VALUE == vt) sc++; + } + TS_ASSERT_EQUALS(3, bc); + TS_ASSERT_EQUALS(1, fc); + TS_ASSERT_EQUALS(1, sc); logger().info("END TEST: %s", __FUNCTION__); } From be9286759112ed5330bac31c3544a3b19c16e6db Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 00:56:31 +0300 Subject: [PATCH 45/56] Add scheme bindings to the table loader --- opencog/persist/csv/CMakeLists.txt | 18 ++++++ opencog/persist/csv/TableSCM.cc | 88 ++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 opencog/persist/csv/TableSCM.cc diff --git a/opencog/persist/csv/CMakeLists.txt b/opencog/persist/csv/CMakeLists.txt index 739333f0ff..6cc3f7c35d 100644 --- a/opencog/persist/csv/CMakeLists.txt +++ b/opencog/persist/csv/CMakeLists.txt @@ -22,3 +22,21 @@ INSTALL (FILES ) # ------------------------------- + +ADD_LIBRARY (csv-table + TableSCM.cc +) + +TARGET_LINK_LIBRARIES(csv-table + csv + atomspace + smob +) + +ADD_GUILE_EXTENSION(SCM_CONFIG csv-table "opencog-ext-path-csv-table") + +INSTALL (TARGETS csv-table EXPORT AtomSpaceTargets + DESTINATION "lib${LIB_DIR_SUFFIX}/opencog" +) + +# ------------------------------- diff --git a/opencog/persist/csv/TableSCM.cc b/opencog/persist/csv/TableSCM.cc new file mode 100644 index 0000000000..489a89edb8 --- /dev/null +++ b/opencog/persist/csv/TableSCM.cc @@ -0,0 +1,88 @@ +/* + * opencog/persist/csv/TableSCM.cc + * + * Copyright (c) 2008 by OpenCog Foundation + * Copyright (c) 2008, 2009, 2013, 2015, 2022 Linas Vepstas + * All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * 
This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _OPENCOG_CSV_TABLE_SCM_H +#define _OPENCOG_CSV_TABLE_SCM_H + +#include + +namespace opencog +{ +/** \addtogroup grp_persist + * @{ + */ + +class TableSCM : public ModuleWrap +{ +private: + void init(void); + + void load_table(const Handle&, const std::string&); +public: + TableSCM(void); +}; // class + +/** @}*/ +} // namespace + +extern "C" { +void opencog_persist_file_init(void); +}; + +#endif // _OPENCOG_CSV_TABLE_SCM_H + +#include +#include + +#include "table_read.h" + +using namespace opencog; + +TableSCM::TableSCM(void) + : ModuleWrap("opencog csv-table") +{ + static bool is_init = false; + if (is_init) return; + is_init = true; + module_init(); +} + +// Temporary(?) Hacky experimental API. Subject to change. 
+void TableSCM::init(void) +{ + define_scheme_primitive("load-table", + &TableSCM::load_table, this, "csv-table"); +} + +// ===================================================================== + +void TableSCM::load_table(const Handle& h, const std::string& path) +{ + // const AtomSpacePtr& as = SchemeSmob::ss_get_env_as("load-table"); + opencog::load_csv_table(h, path); +} + +void opencog_persist_file_init(void) +{ + static TableSCM patty; +} From 2b491d5cec75913b60cd60694379e0355174e652 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 01:03:39 +0300 Subject: [PATCH 46/56] Add the scm side of the csv-table module --- opencog/scm/CMakeLists.txt | 5 +++++ opencog/scm/opencog/csv-table.scm | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 opencog/scm/opencog/csv-table.scm diff --git a/opencog/scm/CMakeLists.txt b/opencog/scm/CMakeLists.txt index 868a9bcbbf..cb86d04524 100644 --- a/opencog/scm/CMakeLists.txt +++ b/opencog/scm/CMakeLists.txt @@ -18,6 +18,11 @@ ADD_GUILE_MODULE (FILES # Each of the files below are distinct modules. They need to be # compiled seperately. +ADD_GUILE_MODULE (FILES + opencog/csv-table.scm + COMPILE +) + ADD_GUILE_MODULE (FILES opencog/exec.scm DEPENDS exec diff --git a/opencog/scm/opencog/csv-table.scm b/opencog/scm/opencog/csv-table.scm new file mode 100644 index 0000000000..acbbea0afc --- /dev/null +++ b/opencog/scm/opencog/csv-table.scm @@ -0,0 +1,23 @@ +; +; OpenCog CSV Table Reader module +; + +(define-module (opencog csv-table)) + +(use-modules (opencog)) +(use-modules (opencog as-config)) +(load-extension + (string-append opencog-ext-path-csv-table "libcsv-table") + "opencog_csv_table_init") + +(export load-table) + +(set-procedure-property! load-table 'documentation +" + load-table ATOM FILE -- Load CSV/TSV table from FILE. + + Throws error if FILE does not exist. 
+ More documentation TBD +") + +; -------------------------------------------------------------------- From b862c41548a4f5b5a73a2fd1f1a19ef3487d5e25 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 01:09:51 +0300 Subject: [PATCH 47/56] Bug fix cut-n-paste error --- opencog/persist/csv/TableSCM.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opencog/persist/csv/TableSCM.cc b/opencog/persist/csv/TableSCM.cc index 489a89edb8..67a3283e57 100644 --- a/opencog/persist/csv/TableSCM.cc +++ b/opencog/persist/csv/TableSCM.cc @@ -46,7 +46,7 @@ class TableSCM : public ModuleWrap } // namespace extern "C" { -void opencog_persist_file_init(void); +void opencog_csv_table_init(void); }; #endif // _OPENCOG_CSV_TABLE_SCM_H @@ -82,7 +82,7 @@ void TableSCM::load_table(const Handle& h, const std::string& path) opencog::load_csv_table(h, path); } -void opencog_persist_file_init(void) +void opencog_csv_table_init(void) { static TableSCM patty; } From 6bd2075795f162b5fc53b81cf58844cf3b95cb25 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 10:05:39 +0300 Subject: [PATCH 48/56] Specify file path correctly --- tests/persist/csv/CSVLoadUTest.cxxtest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/persist/csv/CSVLoadUTest.cxxtest b/tests/persist/csv/CSVLoadUTest.cxxtest index fa30e2d1b8..c05a2f1914 100644 --- a/tests/persist/csv/CSVLoadUTest.cxxtest +++ b/tests/persist/csv/CSVLoadUTest.cxxtest @@ -53,7 +53,7 @@ void CSVLoadUTest::test_simple_load() Handle h = _asp->add_node(CONCEPT_NODE, "foo"); // Argh. Ugly. Fix. - load_csv_table(h, "../tests/persist/csv/simple.csv"); + load_csv_table(h, PROJECT_SOURCE_DIR "/tests/persist/csv/simple.csv"); // There's the five columns, plus the table of contents. 
 HandleSet keys = h->getKeys();

From dbc24f0a330f6d2d1a66b8de05b980cf6f46b158 Mon Sep 17 00:00:00 2001
From: Linas Vepstas
Date: Sun, 21 Aug 2022 10:15:27 +0300
Subject: [PATCH 49/56] Make the AtomSpace an explicit argument

---
 opencog/persist/csv/TableSCM.cc   |  4 ++--
 opencog/persist/csv/table_read.cc | 15 ++++++++-------
 opencog/persist/csv/table_read.h  |  8 +++++---
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/opencog/persist/csv/TableSCM.cc b/opencog/persist/csv/TableSCM.cc
index 67a3283e57..7eaacd71dc 100644
--- a/opencog/persist/csv/TableSCM.cc
+++ b/opencog/persist/csv/TableSCM.cc
@@ -78,8 +78,8 @@ void TableSCM::init(void)
 
 void TableSCM::load_table(const Handle& h, const std::string& path)
 {
-	// const AtomSpacePtr& as = SchemeSmob::ss_get_env_as("load-table");
-	opencog::load_csv_table(h, path);
+	const AtomSpacePtr& as = SchemeSmob::ss_get_env_as("load-table");
+	opencog::load_csv_table(as, h, path);
 }
 
diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc
index a111049343..ae9186ec82 100644
--- a/opencog/persist/csv/table_read.cc
+++ b/opencog/persist/csv/table_read.cc
@@ -466,7 +466,8 @@ static double token_to_contin(const std::string& token)
 // of what is being done here. In breif, columns from a table
 // are jammed into individual values on a given atom.
 static std::istream&
-istreamDenseTable(const Handle& anchor,
+istreamDenseTable(const AtomSpacePtr& as,
+                  const Handle& anchor,
                   std::istream& in,
                   const std::vector& ignore_idxs,
                   const std::vector& col_types,
@@ -560,8 +561,6 @@ istreamDenseTable(const Handle& anchor,
 	// and then each value under's its column name,
 	// all of these on the anchor atom.
 
-	// XXX TODO, we should probably take AtomSpace as an argument!?
-	AtomSpace* as = anchor->getAtomSpace();
 	size_t bc = 0;
 	size_t fc = 0;
 	size_t sc = 0;
@@ -611,7 +610,8 @@ istreamDenseTable(const Handle& anchor,
  * 2) Load the actual data. 
*/ std::istream& -opencog::istreamTable(const Handle& anchor, +opencog::istreamTable(const AtomSpacePtr& as, + const Handle& anchor, std::istream& in, const std::vector& ignore_features) { @@ -635,14 +635,15 @@ opencog::istreamTable(const Handle& anchor, in.seekg(beg); - return istreamDenseTable(anchor, in, ignore_indexes, + return istreamDenseTable(as, anchor, in, ignore_indexes, col_types, header, has_header); } // ================================================================== // See header file for general description. -void opencog::load_csv_table(const Handle& anchor, +void opencog::load_csv_table(const AtomSpacePtr& as, + const Handle& anchor, const std::string& file_name, const string_seq& ignore_features) { @@ -653,7 +654,7 @@ void opencog::load_csv_table(const Handle& anchor, throw RuntimeException(TRACE_INFO, "Could not open %s", file_name.c_str()); - istreamTable(anchor, in, ignore_features); + istreamTable(as, anchor, in, ignore_features); } // ================================================================== diff --git a/opencog/persist/csv/table_read.h b/opencog/persist/csv/table_read.h index 4df8031530..a4504fb519 100644 --- a/opencog/persist/csv/table_read.h +++ b/opencog/persist/csv/table_read.h @@ -31,7 +31,7 @@ #include #include -#include +#include namespace opencog { @@ -74,7 +74,8 @@ typedef std::vector string_seq; * @param file_name * @param ignore_features */ -void load_csv_table(const Handle& anchor, +void load_csv_table(const AtomSpacePtr&, + const Handle& anchor, const std::string& file_name, const string_seq& ignore_features=string_seq()); @@ -83,7 +84,8 @@ void load_csv_table(const Handle& anchor, // const std::vector& ignored_indices=std::vector()); // Same as above, but works for an already-open stream. 
-std::istream& istreamTable(const Handle&, +std::istream& istreamTable(const AtomSpacePtr&, + const Handle&, std::istream&, const string_seq& ignore_features); From eb8adb0acef1601ba2378af97bc8a8cb26886855 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 11:01:47 +0300 Subject: [PATCH 50/56] Start work on a table demo. --- examples/atomspace/table.csv | 22 ++++++++++++++++++++++ examples/atomspace/table.scm | 30 ++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 examples/atomspace/table.csv create mode 100644 examples/atomspace/table.scm diff --git a/examples/atomspace/table.csv b/examples/atomspace/table.csv new file mode 100644 index 0000000000..7e805e855a --- /dev/null +++ b/examples/atomspace/table.csv @@ -0,0 +1,22 @@ +# +# This is a simple demo CSV file. +# It contains a table of data, in comma-separated-value format. +# You can also use tab-separated values. +# +# This table contains a text column header. +# The column labels can be anything. +# If the header is absent, default labels will be generated. +# +b1, b2, b3, flt1, flt2, lbl + +# Now for some data. Three columns of binary numbers, +# Two floats, and one column of strings. + 0, 0, 1, 3.3, 4.4, "one" + 0, 0, 1, 4.4, 5.5, "one" + 0, 1, 1, 3.4, 6.5, "three" + 1, 0, 1, 2.4, 7.5, "five" + +# T and F are maybe better for binary ... + T, F, T, 4, 9, "five" + T, T, F, 5, 11, "six" + T, T, T, 2, 8.9, "seven" diff --git a/examples/atomspace/table.scm b/examples/atomspace/table.scm new file mode 100644 index 0000000000..58096676a6 --- /dev/null +++ b/examples/atomspace/table.scm @@ -0,0 +1,30 @@ +; +; table.scm -- Formulas applied to Values from a CSV table. +; +; Similar to the flows.scm demo. +; +(use-modules (opencog) (opencog exec)) +(use-modules (opencog csv-table)) + +; Create an Atom on which the table will be located. +(define tab (Concept "My foo Table")) + +; Load the table (located in this directory.) 
+(load-table tab "table.csv") + +; Verify that the table loaded. First, take a look at all of the keys: +(cog-keys tab) + +; The ordered list of all the columns will be located at the +; "well-known predicate". All tables will have this; it is an +; ordered list of the columns in the table (in the same order +; as the file.) +(define colkeys (Predicate "*-column-keys-*")) +(cog-value tab colkeys) + +; Verify that the data for each column is present. +; Loop over the columns, and print the keys and values on them. +(for-each + (lambda (KEY) + (format #t "The key ~A holds data ~A\n" KEY (cog-value tab KEY))) + (cog-value->list (cog-value tab colkeys))) From e56a4af9424048c97e6e08e28b5910e4e86ba05e Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 11:14:31 +0300 Subject: [PATCH 51/56] Announce the demo --- examples/atomspace/README.md | 1 + examples/atomspace/table.scm | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/examples/atomspace/README.md b/examples/atomspace/README.md index 03aa741ab6..776a728940 100644 --- a/examples/atomspace/README.md +++ b/examples/atomspace/README.md @@ -51,6 +51,7 @@ first). * `values.scm` -- Using Values and attaching them to Atoms. * `stream.scm` -- Using a stream of time-varying Values. * `formulas.scm` -- Representing arithmetic and computing Values. +* `table.scm` -- Fetching Values from a CSV/TSV table. * `flows.scm` -- Flowing Values around. * `flow-formulas.scm` -- Dynamically updating value flows. * `multi-space.scm` -- Using multiple AtomSpaces at once. diff --git a/examples/atomspace/table.scm b/examples/atomspace/table.scm index 58096676a6..58d8fe9590 100644 --- a/examples/atomspace/table.scm +++ b/examples/atomspace/table.scm @@ -1,7 +1,11 @@ ; -; table.scm -- Formulas applied to Values from a CSV table. +; table.scm -- Formulas applied to Values from a CSV/TSV table. ; -; Similar to the flows.scm demo. 
+; This is similar to the `flows.scm` demo, except that the values
+; are fetched from a conventional DSV (delimiter-separated-value)
+; table. The demo is in two parts. The first part reads the table,
+; (a one-liner) and explores how it is represented in the AtomSpace.
+; The second part applies some formulas to the table columns.
 ;
 (use-modules (opencog) (opencog exec))
 (use-modules (opencog csv-table))
@@ -28,3 +32,18 @@
 	(lambda (KEY)
 		(format #t "The key ~A holds data ~A\n" KEY (cog-value tab KEY)))
 	(cog-value->list (cog-value tab colkeys)))
+;
+; -------------------------------------------------------------------
+; Part two: apply some formulas to the columns.
+;
+; Note that cog-value and cog-execute! ValueOf return the same thing:
+(cog-value tab (PredicateNode "flt1"))
+(cog-execute! (ValueOf tab (PredicateNode "flt1")))
+
+(cog-execute!
+	(Minus
+		(ValueOf tab (PredicateNode "flt2"))
+		(ValueOf tab (PredicateNode "flt1"))))
+
+; That's all, folks.
+; -------------------------------------------------------------------

From 577aa9bb9c69a401d02b3e6f5d9abfaf9ae4662d Mon Sep 17 00:00:00 2001
From: Linas Vepstas
Date: Sun, 21 Aug 2022 11:39:42 +0300
Subject: [PATCH 52/56] Must use FloatValueOf not ValueOf

---
 examples/atomspace/README.md |  2 +-
 examples/atomspace/flows.scm |  4 ++--
 examples/atomspace/table.scm | 27 ++++++++++++++++++++++++---
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/examples/atomspace/README.md b/examples/atomspace/README.md
index 776a728940..66477924cc 100644
--- a/examples/atomspace/README.md
+++ b/examples/atomspace/README.md
@@ -51,9 +51,9 @@ first).
 * `values.scm` -- Using Values and attaching them to Atoms.
 * `stream.scm` -- Using a stream of time-varying Values.
 * `formulas.scm` -- Representing arithmetic and computing Values.
-* `table.scm` -- Fetching Values from a CSV/TSV table.
 * `flows.scm` -- Flowing Values around.
 * `flow-formulas.scm` -- Dynamically updating value flows. 
+* `table.scm` -- Fetching Values from a CSV/TSV table.
 * `multi-space.scm` -- Using multiple AtomSpaces at once.
 
 After going through the above, go to the demos in the
diff --git a/examples/atomspace/flows.scm b/examples/atomspace/flows.scm
index 9bc65ac451..139776676f 100644
--- a/examples/atomspace/flows.scm
+++ b/examples/atomspace/flows.scm
@@ -140,7 +140,7 @@
 ; Try out some math
 (cog-execute!
 (SetValue bar kee
- (Times (ValueOf foo key) (ValueOf foo key))))
+ (Times (FloatValueOf foo key) (FloatValueOf foo key))))
 
 ; Verify
 (cog-execute! (ValueOf bar kee))
@@ -162,6 +162,6 @@
 (cog-execute!
 (SetValue bar kee
 (DefinedSchema "triangle numbers")
- (ValueOf foo key)))
+ (FloatValueOf foo key)))
 ;
 ; -------- THE END -----------
diff --git a/examples/atomspace/table.scm b/examples/atomspace/table.scm
index 58d8fe9590..604e8fc50b 100644
--- a/examples/atomspace/table.scm
+++ b/examples/atomspace/table.scm
@@ -36,14 +36,35 @@
 ; -------------------------------------------------------------------
 ; Part two: apply some formulas to the columns.
 ;
-; Note that cog-value and cog-execute! ValueOf return the same thing:
+; Note that `cog-value` and `cog-execute! ValueOf` return the same thing:
 (cog-value tab (PredicateNode "flt1"))
 (cog-execute! (ValueOf tab (PredicateNode "flt1")))
 
+; Take the difference of two columns. Note that `FloatValueOf` is
+; used instead of `ValueOf`, so that the type-checking subsystem
+; is happy about the types passed to the operator.
 (cog-execute!
 (Minus
- (ValueOf tab (PredicateNode "flt2"))
- (ValueOf tab (PredicateNode "flt1"))))
+ (FloatValueOf tab (PredicateNode "flt2"))
+ (FloatValueOf tab (PredicateNode "flt1"))))
+
+; The above can be wrapped into a function. Several examples follow,
+; below. First, a function that takes the table as an argument,
+; subtracts two columns, and places the result in a third column.
+; The column names are hard-coded in the function.
+ +(DefineLink + (DefinedSchema "col diffs") + (Lambda + (Variable "$tbl-name") + (SetValue + (Variable "$tbl-name") (Predicate "f2 minus f1") + (Minus + (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt2")) + (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt1")))))) + +(cog-execute! (DefinedSchema "col diffs") tab) + ; That's all, folks. ; ------------------------------------------------------------------- From 9e61cc08c9e766545b668ee59779b287e32749ea Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 11:46:17 +0300 Subject: [PATCH 53/56] Update unit test to use the new API. --- tests/persist/csv/CSVLoadUTest.cxxtest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/persist/csv/CSVLoadUTest.cxxtest b/tests/persist/csv/CSVLoadUTest.cxxtest index c05a2f1914..05e5a85322 100644 --- a/tests/persist/csv/CSVLoadUTest.cxxtest +++ b/tests/persist/csv/CSVLoadUTest.cxxtest @@ -53,7 +53,7 @@ void CSVLoadUTest::test_simple_load() Handle h = _asp->add_node(CONCEPT_NODE, "foo"); // Argh. Ugly. Fix. - load_csv_table(h, PROJECT_SOURCE_DIR "/tests/persist/csv/simple.csv"); + load_csv_table(_asp, h, PROJECT_SOURCE_DIR "/tests/persist/csv/simple.csv"); // There's the five columns, plus the table of contents. HandleSet keys = h->getKeys(); From f6df9940d1f618a39cbd73787c1b5738c9d92ec5 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 11:56:10 +0300 Subject: [PATCH 54/56] Provide a scoring function example. --- examples/atomspace/table.scm | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/examples/atomspace/table.scm b/examples/atomspace/table.scm index 604e8fc50b..800718b907 100644 --- a/examples/atomspace/table.scm +++ b/examples/atomspace/table.scm @@ -7,6 +7,8 @@ ; (a one-liner) and explores how it is represented in the AtomSpace. ; The second part applies some formulas to the table columns. 
; +; The second part of the demo +; (use-modules (opencog) (opencog exec)) (use-modules (opencog csv-table)) @@ -63,8 +65,35 @@ (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt2")) (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt1")))))) -(cog-execute! (DefinedSchema "col diffs") tab) +; Apply the function to the table. +(cog-execute! (Put (DefinedSchema "col diffs") tab)) + +; Verify that the new column showed up. +(cog-keys tab) + +; .. and that it contains the expected data. +(cog-value tab (Predicate "f2 minus f1")) + +;-------- +; The AccumulateLink can be used to sum up all of the rows in a column. +(cog-execute! + (Accumulate (FloatValueOf tab (Predicate "f2 minus f1")))) + +; This can be turned into a simple scoring function. It computes the +; sum-total of the difference of two columns. This is a score, in that +; it is a single number that can be used as a utility function in +; conventional machine-learning algos. +(DefineLink + (DefinedSchema "compute score") + (Lambda + (Variable "$tbl-name") + (Accumulate + (Minus + (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt2")) + (FloatValueOf (Variable "$tbl-name") (PredicateNode "flt1")))))) +; Apply the function to the table. +(cog-execute! (Put (DefinedSchema "compute score") tab)) ; That's all, folks. ; ------------------------------------------------------------------- From 1aec9200db8d1773b3d77ff3d3ef3bd993065192 Mon Sep 17 00:00:00 2001 From: Linas Vepstas Date: Sun, 21 Aug 2022 12:02:25 +0300 Subject: [PATCH 55/56] Add explanation of the demo --- examples/atomspace/table.scm | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/atomspace/table.scm b/examples/atomspace/table.scm index 800718b907..79c447c7ff 100644 --- a/examples/atomspace/table.scm +++ b/examples/atomspace/table.scm @@ -7,7 +7,18 @@ ; (a one-liner) and explores how it is represented in the AtomSpace. ; The second part applies some formulas to the table columns. 
 ;
-; The second part of the demo
+; The second part of the demo is interesting, because it shows how
+; functions, written in Atomese, can be applied to tables, and how
+; a "utility function" or a "scoring function" can be written.
+; Utility functions are commonly used in machine learning; they
+; provide a grand-total score that can be maximized or minimized during
+; training. The interesting point here is that the scoring function
+; is represented in Atomese: it is some tree, some DAG of inputs.
+; These trees can be randomly generated and mutated, thus allowing
+; genetic-programming algorithms to be implemented in the AtomSpace.
+;
+; This is, of course, exactly what AS-MOSES does. This is effectively
+; a demo of a sub-component of the AS-MOSES subsystem.
 ;
 (use-modules (opencog) (opencog exec))
 (use-modules (opencog csv-table))
From 54f05f471dec32eeb36c546a1aa63a6a51bee4d3 Mon Sep 17 00:00:00 2001
From: Linas Vepstas
Date: Sun, 21 Aug 2022 12:09:13 +0300
Subject: [PATCH 56/56] List additional modules.

---
 examples/atomspace/README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/atomspace/README.md b/examples/atomspace/README.md
index 66477924cc..8c42c2f6ff 100644
--- a/examples/atomspace/README.md
+++ b/examples/atomspace/README.md
@@ -193,6 +193,7 @@ everything else depends on.
 ```
 (use-modules (opencog))
 (use-modules (opencog atom-types))
+(use-modules (opencog csv-table))
 (use-modules (opencog exec))
 (use-modules (opencog logger))
 (use-modules (opencog matrix))
@@ -202,9 +203,11 @@ 
 (use-modules (opencog persist-rocks))
 (use-modules (opencog persist-sql))
 (use-modules (opencog python))
+(use-modules (opencog randgen))
 (use-modules (opencog sheaf))
 (use-modules (opencog test-runner))
 (use-modules (opencog type-utils))
+(use-modules (opencog uuid))
 ```

 There are other modules provided in other projects and repos. Here is