diff --git a/examples/atomspace/README.md b/examples/atomspace/README.md
index 03aa741ab6..8c42c2f6ff 100644
--- a/examples/atomspace/README.md
+++ b/examples/atomspace/README.md
@@ -53,6 +53,7 @@ first).
 * `formulas.scm` -- Representing arithmetic and computing Values.
 * `flows.scm` -- Flowing Values around.
 * `flow-formulas.scm` -- Dynamically updating value flows.
+* `table.scm` -- Fetching Values from a CSV/TSV table.
 * `multi-space.scm` -- Using multiple AtomSpaces at once.
 
 After going through the above, go to the demos in the
@@ -192,6 +193,7 @@ everything else depends on.
 ```
 (use-modules (opencog))
 (use-modules (opencog atom-types))
+(use-modules (opencog csv-table))
 (use-modules (opencog exec))
 (use-modules (opencog logger))
 (use-modules (opencog matrix))
@@ -201,9 +203,11 @@ everything else depends on.
 (use-modules (opencog persist-rocks))
 (use-modules (opencog persist-sql))
 (use-modules (opencog python))
+(use-modules (opencog randgen))
 (use-modules (opencog sheaf))
 (use-modules (opencog test-runner))
 (use-modules (opencog type-utils))
+(use-modules (opencog uuid))
 ```
 
 There are other modules provided in other projects and repos. Here is
diff --git a/examples/atomspace/flows.scm b/examples/atomspace/flows.scm
index 9bc65ac451..139776676f 100644
--- a/examples/atomspace/flows.scm
+++ b/examples/atomspace/flows.scm
@@ -140,7 +140,7 @@
 
 ; Try out some math
 (cog-execute! (SetValue bar kee
-	(Times (ValueOf foo key) (ValueOf foo key))))
+	(Times (FloatValueOf foo key) (FloatValueOf foo key))))
 
 ; Verify
 (cog-execute! (ValueOf bar kee))
@@ -162,6 +162,6 @@
 (cog-execute! (SetValue bar kee
 	(DefinedSchema "triangle numbers")
-	(ValueOf foo key)))
+	(FloatValueOf foo key)))
 
 ;
 ; -------- THE END -----------
diff --git a/examples/atomspace/table.csv b/examples/atomspace/table.csv
new file mode 100644
index 0000000000..7e805e855a
--- /dev/null
+++ b/examples/atomspace/table.csv
@@ -0,0 +1,22 @@
+#
+# This is a simple demo CSV file.
+# It contains a table of data, in comma-separated-value format.
+# You can also use tab-separated values.
+#
+# This table contains a text column header.
+# The column labels can be anything.
+# If the header is absent, default labels will be generated.
+#
+b1, b2, b3, flt1, flt2, lbl
+
+# Now for some data. Three columns of binary numbers,
+# two floats, and one column of strings.
+ 0, 0, 1, 3.3, 4.4, "one"
+ 0, 0, 1, 4.4, 5.5, "one"
+ 0, 1, 1, 3.4, 6.5, "three"
+ 1, 0, 1, 2.4, 7.5, "five"
+
+# T and F are maybe better for binary ...
+ T, F, T, 4, 9, "five"
+ T, T, F, 5, 11, "six"
+ T, T, T, 2, 8.9, "seven"
diff --git a/examples/atomspace/table.scm b/examples/atomspace/table.scm
new file mode 100644
index 0000000000..79c447c7ff
--- /dev/null
+++ b/examples/atomspace/table.scm
@@ -0,0 +1,110 @@
+;
+; table.scm -- Formulas applied to Values from a CSV/TSV table.
+;
+; This is similar to the `flows.scm` demo, except that the values
+; are fetched from a conventional DSV (delimiter-separated-value)
+; table. The demo is in two parts. The first part reads the table
+; (a one-liner) and explores how it is represented in the AtomSpace.
+; The second part applies some formulas to the table columns.
+;
+; The second part of the demo is interesting, because it shows how
+; functions, written in Atomese, can be applied to tables, and how
+; a "utility function" or a "scoring function" can be written.
+; Utility functions are commonly used in machine learning; they
+; provide a grand-total score that can be maximized or minimized during
+; training.  The interesting point here is that the scoring function
+; is represented in Atomese: it is some tree, some DAG of inputs.
+; These trees can be randomly generated and mutated, thus allowing
+; genetic-programming algorithms to be implemented in the AtomSpace.
+;
+; This is, of course, exactly what AS-MOSES does. This is effectively
+; a demo of a sub-component of the AS-MOSES subsystem.
+;
+(use-modules (opencog) (opencog exec))
+(use-modules (opencog csv-table))
+
+; Create an Atom on which the table will be located.
+(define tab (Concept "My foo Table"))
+
+; Load the table (located in this directory).
+(load-table tab "table.csv")
+
+; Verify that the table loaded. First, take a look at all of the keys:
+(cog-keys tab)
+
+; The ordered list of all the columns will be located at the
+; "well-known predicate". All tables will have this; it is an
+; ordered list of the columns in the table (in the same order
+; as in the file).
+(define colkeys (Predicate "*-column-keys-*"))
+(cog-value tab colkeys)
+
+; Verify that the data for each column is present.
+; Loop over the columns, and print the keys and values on them.
+(for-each
+	(lambda (KEY)
+		(format #t "The key ~A holds data ~A\n" KEY (cog-value tab KEY)))
+	(cog-value->list (cog-value tab colkeys)))
+;
+; -------------------------------------------------------------------
+; Part two: apply some formulas to the columns.
+;
+; Note that `cog-value` and `cog-execute! ValueOf` return the same thing:
+(cog-value tab (PredicateNode "flt1"))
+(cog-execute! (ValueOf tab (PredicateNode "flt1")))
+
+; Take the difference of two columns. Note that `FloatValueOf` is
+; used instead of `ValueOf`, so that the type-checking subsystem
+; is happy about the types passed to the operator.
+(cog-execute!
+	(Minus
+		(FloatValueOf tab (PredicateNode "flt2"))
+		(FloatValueOf tab (PredicateNode "flt1"))))
+
+; The above can be wrapped into a function. Several examples follow,
+; below. First, a function that takes the table as an argument,
+; subtracts two columns, and places the result in a third column.
+; The column names are hard-coded in the function.
+
+(DefineLink
+	(DefinedSchema "col diffs")
+	(Lambda
+		(Variable "$tbl-name")
+		(SetValue
+			(Variable "$tbl-name") (Predicate "f2 minus f1")
+			(Minus
+				(FloatValueOf (Variable "$tbl-name") (PredicateNode "flt2"))
+				(FloatValueOf (Variable "$tbl-name") (PredicateNode "flt1"))))))
+
+; Apply the function to the table.
+(cog-execute! (Put (DefinedSchema "col diffs") tab))
+
+; Verify that the new column showed up.
+(cog-keys tab)
+
+; ... and that it contains the expected data.
+(cog-value tab (Predicate "f2 minus f1"))
+
+;--------
+; The AccumulateLink can be used to sum up all of the rows in a column.
+(cog-execute!
+	(Accumulate (FloatValueOf tab (Predicate "f2 minus f1"))))
+
+; This can be turned into a simple scoring function. It computes the
+; sum-total of the difference of two columns. This is a score, in that
+; it is a single number that can be used as a utility function in
+; conventional machine-learning algos.
+(DefineLink
+	(DefinedSchema "compute score")
+	(Lambda
+		(Variable "$tbl-name")
+		(Accumulate
+			(Minus
+				(FloatValueOf (Variable "$tbl-name") (PredicateNode "flt2"))
+				(FloatValueOf (Variable "$tbl-name") (PredicateNode "flt1"))))))
+
+; Apply the function to the table.
+(cog-execute! (Put (DefinedSchema "compute score") tab))
+
+; That's all, folks.
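+;
+; Postscript: a bonus sketch, not part of the original demo above.
+; Scoring functions are not limited to column differences; any
+; arithmetic tree over the columns will do. For example, summing the
+; element-wise product of two columns gives a dot-product-style score.
+; The schema name "dot product score" is invented here; the column
+; names "flt1" and "flt2" are the ones from table.csv.
+(DefineLink
+	(DefinedSchema "dot product score")
+	(Lambda
+		(Variable "$tbl-name")
+		(Accumulate
+			(Times
+				(FloatValueOf (Variable "$tbl-name") (Predicate "flt1"))
+				(FloatValueOf (Variable "$tbl-name") (Predicate "flt2"))))))
+
+; Apply it to the table, exactly as before.
+(cog-execute! (Put (DefinedSchema "dot product score") tab))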
+; -------------------------------------------------------------------
diff --git a/opencog/atoms/value/README.md b/opencog/atoms/value/README.md
index 95c2eb6ff0..60662eb7a8 100644
--- a/opencog/atoms/value/README.md
+++ b/opencog/atoms/value/README.md
@@ -94,4 +94,10 @@ Adding New Atom and Value Types
 Please see the
 [README-Adding-New-Atom-Types.md](../atom_types/README-Adding-New-Atom-Types.md)
 file.
 
+See also the [Custom Types Example](../../../examples/type-system/README.md).
+
+TODO
+----
+* Perhaps add a TypeValue, which would be a vector of Types. It could
+  be useful as a kind of table signature (for the csv table handling
+  code).
diff --git a/opencog/persist/CMakeLists.txt b/opencog/persist/CMakeLists.txt
index 9fd966d7bb..ea24c420ad 100644
--- a/opencog/persist/CMakeLists.txt
+++ b/opencog/persist/CMakeLists.txt
@@ -1,5 +1,6 @@
 ADD_SUBDIRECTORY (storage)
 ADD_SUBDIRECTORY (api)
+ADD_SUBDIRECTORY (csv)
 
 IF (HAVE_GEARMAN AND HAVE_GUILE)
 	ADD_SUBDIRECTORY (gearman)
diff --git a/opencog/persist/README.md b/opencog/persist/README.md
index e40197a653..1e51b39928 100644
--- a/opencog/persist/README.md
+++ b/opencog/persist/README.md
@@ -17,6 +17,14 @@ Local subdirectories include:
       for RocksDB and one that allows AtomSpaces to trade Atoms
       over the network.)
 
+* csv -- Load Values from CSV/TSV files. These are "delimiter
+      separated values" -- ordinary tables. Each column in the
+      table is loaded into an appropriate Value (`FloatValue`,
+      `BoolValue` or `StringValue`). The values are placed
+      under keys (named after the column) on the provided Atom.
+      This is intended for the ASMOSES subsystem, which
+      naturally operates on tables or streams of data.
+
 * file -- Read and write files containing Atomese s-expressions.
       Provides both a `FileStorageNode`, and also some utilities
       to read files, and dump Atomspace contents to files or
diff --git a/opencog/persist/csv/CMakeLists.txt b/opencog/persist/csv/CMakeLists.txt
new file mode 100644
index 0000000000..6cc3f7c35d
--- /dev/null
+++ b/opencog/persist/csv/CMakeLists.txt
@@ -0,0 +1,42 @@
+
+# Generic CSV/DSV table decoding.
+ADD_LIBRARY (csv
+	table_read.cc
+)
+
+ADD_DEPENDENCIES(csv opencog_atom_types)
+
+TARGET_LINK_LIBRARIES(csv
+	atomspace
+	atombase
+	${COGUTIL_LIBRARY}
+)
+
+INSTALL (TARGETS csv EXPORT AtomSpaceTargets
+	DESTINATION "lib${LIB_DIR_SUFFIX}/opencog"
+)
+
+INSTALL (FILES
+	table_read.h
+	DESTINATION "include/opencog/persist/csv"
+)
+
+# -------------------------------
+
+ADD_LIBRARY (csv-table
+	TableSCM.cc
+)
+
+TARGET_LINK_LIBRARIES(csv-table
+	csv
+	atomspace
+	smob
+)
+
+ADD_GUILE_EXTENSION(SCM_CONFIG csv-table "opencog-ext-path-csv-table")
+
+INSTALL (TARGETS csv-table EXPORT AtomSpaceTargets
+	DESTINATION "lib${LIB_DIR_SUFFIX}/opencog"
+)
+
+# -------------------------------
diff --git a/opencog/persist/csv/README.md b/opencog/persist/csv/README.md
new file mode 100644
index 0000000000..87a4f06ed7
--- /dev/null
+++ b/opencog/persist/csv/README.md
@@ -0,0 +1,50 @@
+
+Load Ordinary CSV Tables
+========================
+The code here is able to load "delimiter-separated values" (DSV,
+or CSV, TSV for comma and tab separators) from a file. These are
+just very conventional tables.
+
+Each column from a DSV file is read in and placed into Atomese
+Values on an indicated Atom. Atomese Values are vectors (of floats,
+bools, strings). Each Value holds one column from the dataset.
+
+Basically, this just gets CSV data into the AtomSpace, where it
+becomes easy for Atomese programs to act on them, i.e.
to use them +as input for some kind of data stream processing. + +The features (columns) specified in ignore_features will be omitted +from the representation. + +Example +------- +For example, a CSV dataset like this: +``` + o, i1, i2, i3, i4 + 1, 0, 0, 3.3, "foo" + 0, 1, 0, 4.4, "bar" +``` +will be loaded as key-value pairs on the `anchor` Atom. + +The column names will be loaded under a "well known key": +``` + (Predicate "*-column-keys-*") +``` +This key will point at a value holding a list of all of the +column-keys in the table: +``` + (LinkValue + (Predicate "o") + (Predicate "i1") + (Predicate "i2") + (Predicate "i3") + (Predicate "i4")) +``` +Then, under each key, there will a column of values: +``` + (Predicate "o") (BoolValue 1 0) + (Predicate "i1") (BoolValue 0 1) + (Predicate "i2") (BoolValue 0 0) + (Predicate "i3") (FloatValue 3.3 4.4) + (Predicate "i4") (StringValue "foo" "bar") +``` diff --git a/opencog/persist/csv/TableSCM.cc b/opencog/persist/csv/TableSCM.cc new file mode 100644 index 0000000000..7eaacd71dc --- /dev/null +++ b/opencog/persist/csv/TableSCM.cc @@ -0,0 +1,88 @@ +/* + * opencog/persist/csv/TableSCM.cc + * + * Copyright (c) 2008 by OpenCog Foundation + * Copyright (c) 2008, 2009, 2013, 2015, 2022 Linas Vepstas + * All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _OPENCOG_CSV_TABLE_SCM_H +#define _OPENCOG_CSV_TABLE_SCM_H + +#include + +namespace opencog +{ +/** \addtogroup grp_persist + * @{ + */ + +class TableSCM : public ModuleWrap +{ +private: + void init(void); + + void load_table(const Handle&, const std::string&); +public: + TableSCM(void); +}; // class + +/** @}*/ +} // namespace + +extern "C" { +void opencog_csv_table_init(void); +}; + +#endif // _OPENCOG_CSV_TABLE_SCM_H + +#include +#include + +#include "table_read.h" + +using namespace opencog; + +TableSCM::TableSCM(void) + : ModuleWrap("opencog csv-table") +{ + static bool is_init = false; + if (is_init) return; + is_init = true; + module_init(); +} + +// Temporary(?) Hacky experimental API. Subject to change. 
+void TableSCM::init(void) +{ + define_scheme_primitive("load-table", + &TableSCM::load_table, this, "csv-table"); +} + +// ===================================================================== + +void TableSCM::load_table(const Handle& h, const std::string& path) +{ + const AtomSpacePtr& as = SchemeSmob::ss_get_env_as("load-table"); + opencog::load_csv_table(as, h, path); +} + +void opencog_csv_table_init(void) +{ + static TableSCM patty; +} diff --git a/opencog/persist/csv/table_read.cc b/opencog/persist/csv/table_read.cc new file mode 100644 index 0000000000..ae9186ec82 --- /dev/null +++ b/opencog/persist/csv/table_read.cc @@ -0,0 +1,660 @@ +/** table_read.cc -- + * + * Copyright (C) 2010 OpenCog Foundation + * Copyright (C) 2012 Poulin Holdings LLC + * Copyright (C) 2022 Linas Vepstas + * + * Authors: Nil Geisweiller + * Linas Vepstas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "table_read.h" + +using namespace opencog; + +// ------------------------------------------------------- + +/** + * Remove the carriage return (for DOS format). + */ +static void removeCarriageReturn(std::string& str) +{ + size_t s = str.size(); + if ((s > 0) && (str[s-1] == '\r')) + str.resize(s-1); +} + +/** + * Remove non ASCII char at the begining of the string. + */ +static void removeNonASCII(std::string& str) +{ + while (str.size() && (unsigned char)str[0] > 127) + str = str.substr(1); +} + +// ------------------------------------------------------- +// Return true if the character is one of the standard comment +// delimiters. Here, we define a 'standard delimiter' as one +// of hash, bang or semicolon. +static bool is_comment(const char c) +{ + if ('#' == c) return true; + if (';' == c) return true; + if ('!' == c) return true; + if ('\n' == c) return true; + if ('\r' == c) return true; + if (0 == c) return true; + return false; +} + +/// Get one line of actual data. +/// This ignores lines that start with a 'standard comment char' +/// +// +// TODO: This routine should be extended so that comments that start +// somewhere other than column 0 are also ignored. +// +// The signature of this routine is the same as std:getline() +// +std::istream& get_data_line(std::istream& is, std::string& line) +{ + while (true) + { + getline(is, line); + if (!is) return is; + if (is_comment(line[0])) continue; + + // Remove weird symbols at the start of the line (only). + removeNonASCII(line); + // Remove carriage return at end of line (for DOS files). 
+ removeCarriageReturn(line); + + return is; + } +} + +// ------------------------------------------------------- + +typedef boost::tokenizer> table_tokenizer; + +/** + * Take a row, return a tokenizer. Tokenization uses the + * separator characters comma, blank, tab (',', ' ' or '\t'). + */ +static table_tokenizer get_row_tokenizer(const std::string& line) +{ + typedef boost::escaped_list_separator separator; + typedef boost::tokenizer tokenizer; + + // Tokenize line; currently, we allow tabs, commas, blanks. + static const separator sep("\\", ",\t ", "\""); + return tokenizer(line, sep); +} + +/** + * Take a line and return a vector containing the elements parsed. + */ +static std::vector tokenizeRow (const std::string& line) +{ + table_tokenizer tok = get_row_tokenizer(line); + std::vector res; + for (const std::string& t : tok) + { + // Trim away whitespace padding; failing to do this + // confuses stuff downstream. + std::string clean(t); + boost::trim(clean); + + // Sometimes the tokenizer returns pure whitespace :-( + if (0 == clean.size()) continue; + + res.push_back(clean); + } + return res; +} + +// ------------------------------------------------------- +/** + * Given an input string, guess the type of the string. + * Inferable types are: boolean, contin and enum. + */ +static Type infer_type_from_token(const std::string& token) +{ + /* Prefered representation is T's and 0's, to maximize clarity, + * readability. Numeric values are easily confused with floating + * point type. + */ + if (token == "0" || + token == "1" || + token == "T" || + token == "F" || + token == "t" || + token == "f") + return BOOL_VALUE; + + // If it starts with an alphabetic character, assume its a string + else if (isalpha(token[0])) + return STRING_VALUE; + + // Hope that we can cast this to a float point number. + else { + try { + boost::lexical_cast(token); + return FLOAT_VALUE; + } + catch(...) { + return VOID_VALUE; + } + } +} + +/** + * Given an input string, guess the type of the string. + * Inferable types are: boolean, contin and enum. + * Compare this to 'curr_guess', and upgrade the type inference + * if it can be done consistently. + */ +static Type +infer_type_from_token2(Type curr_guess, const std::string& token) +{ + Type tokt = infer_type_from_token(token); + + // First time, just go with the flow. + if (VOID_VALUE == curr_guess) + return tokt; + + // Yayy! its consistent! + if (tokt == curr_guess) + return tokt; + + // If we saw 0,1 when expecting a contin, its a contin. + if ((FLOAT_VALUE == curr_guess) && (BOOL_VALUE == tokt)) + return curr_guess; + + // If we thought its a boolean 0,1 it might be a contin. + if ((BOOL_VALUE == curr_guess) && (FLOAT_VALUE == tokt)) + return tokt; + + // If we got to here, then there's some sort of unexpected + // inconsistency in the column types; we've got to presume that + // its just some crazy ascii string, i.e. enum_type. + return STRING_VALUE; +} + +// =========================================================== +#ifdef NOT_USED_ANYWHERE + +// istream regular tables. +static const char *sparse_delim = " : "; + +typedef std::vector ITable; + +/** + * Fill the input table, given a file in DSV (delimiter-seperated values) + * format. The delimiters are ',', ' ' or '\t'. + * + * It stuffs all data into the table as strings; type conversion to + * the appropriate type, and thunking for the header, and ignoring + * certain features, must all be done as a separate step. 
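+ *
+ * Note: this routine is wrapped in `#ifdef NOT_USED_ANYWHERE`, and so
+ * is currently compiled out; it is kept here for reference only.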
+ */ +std::istream& istreamRawITable(std::istream& in, ITable& tab, + const std::vector& ignored_indices) +{ + std::streampos beg = in.tellg(); + + // Get the entire dataset into memory + std::string line; + std::vector lines; + + // Read first few by hand. The first might be labels, so we must + // get at least the second line. But the second line might have + // all default feature values (i.e. no colon), so get the third... + dorepeat(20) + { + if (!get_data_line(in, line)) + break; + + // If it is a sparse file, we are outta here. + // Throw an std::exception, since we don't want to log this as an + // error (all the other exception types log to the log file). + if (std::string::npos != line.find (sparse_delim)) + { + in.seekg(beg); + throw std::exception(); + } + lines.push_back(line); + } + + // Grab the rest of the file. + while (get_data_line(in, line)) + lines.push_back(line); + + // Determine the arity from the first line. + std::vector fl = tokenizeRow(lines[0]); + size_t arity = fl.size(); + + std::atomic arity_fail_row(-1); + auto parse_line = [&](size_t i) + { + // tokenize the line and fill the table with + tab[i] = tokenizeRow(lines[i]); + + // Check arity + if (arity != tab[i].size()) + arity_fail_row = i + 1; + }; + + // Vector of indices [0, lines.size()) + size_t ls = lines.size(); + tab.resize(ls); + auto ir = boost::irange((size_t)0, ls); + std::vector indices(ir.begin(), ir.end()); + OMP_ALGO::for_each(indices.begin(), indices.end(), parse_line); + + if (-1 != arity_fail_row) { + in.seekg(beg); + throw SyntaxException(TRACE_INFO, + "ERROR: Input file inconsistent: the %uth row has " + "a different number of columns than the rest of the file. " + "All rows should have the same number of columns.\n", + arity_fail_row.load()); + } + return in; +} +#endif // NOT_USED_ANYWHERE + +// =========================================================== + +/** + * Infer the column types of the input table. It is assumed the + * table's rows are vector of strings. + */ +std::vector infer_column_types(const std::vector& tab) +{ + std::vector::const_iterator rowit = tab.begin(); + + size_t arity = rowit->size(); + std::vector types(arity, VOID_VALUE); + + // Skip the first line, it might be a header... + // and that would confuse type inference. + if (tab.size() > 1) + ++rowit; + + // Loop over all rows; this performs a consistency check. + for (; rowit != tab.end(); ++rowit) + { + const string_seq& tokens = *rowit; + for (size_t i=0; i& col_types) +{ + for (size_t i = 0; i < tokens.size(); i++) + { + Type flt = infer_type_from_token2(col_types[i], tokens[i]); + if ((STRING_VALUE == flt) && (STRING_VALUE != col_types[i])) + return true; + } + return false; +} + +// ================================================================== + +/** + * Get indices (aka positions or offsets) of a list of labels given a + * header. The labels can be sequenced in any order, it will always + * return the order consistent with the header. 
+ */ +static std::vector +get_indices(const string_seq &labels, + const string_seq &header) +{ + std::vector res; + for (size_t i = 0; i < header.size(); ++i) + if (std::find(labels.begin(), labels.end(), header[i]) != labels.end()) + res.push_back(i); + return res; +} + +// ================================================================== + +static std::istream& +inferTableAttributes(std::istream& in, + const std::vector& ignore_features, + std::vector& ignore_idxs, + std::vector& types, + std::vector& maybe_header, + bool& has_header) +{ + has_header = false; + + std::streampos beg = in.tellg(); + + // maxline is the maximum number of lines to read to infer the + // attributes. A negative number means reading all lines. + int maxline = 20; + + // Get a portion of the dataset into memory (cleaning weird stuff) + std::vector lines; + std::string line; + while (get_data_line(in, line) and 0 < maxline--) + lines.push_back(line); + + // Parse what could be a header + maybe_header = tokenizeRow(lines.front()); + + // Determine arity + size_t arity = maybe_header.size(); + std::atomic arity_fail_row(-1); + + // Determine initial type + types.resize(arity, VOID_VALUE); + + // Parse the rest, determine its type and whether the arity is + // consistent + for (size_t i = 1; i < lines.size(); ++i) + { + // Parse line + const string_seq& tokens = tokenizeRow(lines[i]); + + // Check arity + if (arity != tokens.size()) + { + arity_fail_row = i + 1; + in.seekg(beg); + in.clear(); // in case it has reached the eof + throw SyntaxException(TRACE_INFO, + "ERROR: Input file inconsistent: the %uth row has a " + "different number of columns than the rest of the file. " + "All rows should have the same number of columns.\n", + arity_fail_row.load()); + } + + // Infer type + boost::transform(types, tokens, types.begin(), + infer_type_from_token2); + } + + // Determine has_header + has_header = is_header(maybe_header, types); + + // Determine type signature + if (has_header) + { + ignore_idxs = get_indices(ignore_features, maybe_header); + boost::sort(ignore_idxs); + } + + in.seekg(beg); + in.clear(); // in case it has reached the eof + return in; +} + +// ================================================================== + +/// cast string "token" to a vertex of type "tipe" +static bool token_to_bool(const std::string& token) +{ + if ("0" == token || "F" == token || "f" == token) + return false; + + if ("1" == token || "T" == token || "t" == token) + return true; + + throw SyntaxException(TRACE_INFO, + "Expecting boolean value, got %s", token.c_str()); +} + +static double token_to_contin(const std::string& token) +{ + try { + return boost::lexical_cast(token); + } catch (boost::bad_lexical_cast&) { + throw SyntaxException(TRACE_INFO, + "Could not cast %s to floating point", token.c_str()); + } +} + +// See header file for `load_csv_table` for a general description +// of what is being done here. In breif, columns from a table +// are jammed into individual values on a given atom. +static std::istream& +istreamDenseTable(const AtomSpacePtr& as, + const Handle& anchor, + std::istream& in, + const std::vector& ignore_idxs, + const std::vector& col_types, + const std::vector& header, + bool has_header) +{ + // Width of table in the input. + size_t table_width = col_types.size(); + + // Effective width is the width, without the ignored columns. + // size_t effective_width = table_width - ignore_idxs.size(); + + // Setup a mask; should we skip the column? 
+ std::vector skip_col(table_width, false); + for (unsigned i : ignore_idxs) + skip_col[i] = true; + + // Set up typed columns. They're empty at first. + std::vector> bool_cols; + std::vector> float_cols; + std::vector> string_cols; + + for (size_t ic = 0; ic < table_width; ic++) + { + if (skip_col[ic]) continue; + if (BOOL_VALUE == col_types[ic]) + bool_cols.push_back(std::vector()); + else + if (FLOAT_VALUE == col_types[ic]) + float_cols.push_back(std::vector()); + else + if (STRING_VALUE == col_types[ic]) + string_cols.push_back(std::vector()); + else + throw RuntimeException(TRACE_INFO, + "Unhandled column type"); + } + + // ---------------------------------------------- + std::string line; + + // Assume the stream is at the begining. + // If there is a header, skip one line. + if (has_header) + get_data_line(in, line); + + // Loop over all lines in the table, one by one. + // Stuff the desired columns into each of the columns + // we created above. + while (get_data_line(in, line)) + { + size_t ic = 0; + size_t bc = 0; + size_t fc = 0; + size_t sc = 0; + std::vector toks = tokenizeRow(line); + for (const std::string& tok : toks) + { + if (skip_col[ic]) { ic++; continue; } + if (BOOL_VALUE == col_types[ic]) + { + bool_cols[bc].push_back(token_to_bool(tok)); + bc ++; + ic ++; + continue; + } + + if (FLOAT_VALUE == col_types[ic]) + { + float_cols[fc].push_back(token_to_contin(tok)); + fc ++; + ic ++; + continue; + } + + if (STRING_VALUE == col_types[ic]) + { + string_cols[sc].push_back(tok); + sc ++; + ic ++; + continue; + } + + throw RuntimeException(TRACE_INFO, + "Unhandled column type"); + } + } + + // Now that we've read everything in, + // place the individual columns into Values, + // and then each value under's its column name, + // all of these on the anchor atom. + + size_t bc = 0; + size_t fc = 0; + size_t sc = 0; + HandleSeq keylist; + for (size_t ic = 0; ic < table_width; ic++) + { + if (skip_col[ic]) continue; + + ValuePtr vp; + if (BOOL_VALUE == col_types[ic]) + vp = createBoolValue(bool_cols[bc++]); + + else if (FLOAT_VALUE == col_types[ic]) + vp = createFloatValue(float_cols[fc++]); + + else if (STRING_VALUE == col_types[ic]) + vp = createStringValue(string_cols[sc++]); + + else + throw RuntimeException(TRACE_INFO, + "Unhandled column type"); + + Handle key = as->add_node(PREDICATE_NODE, std::string(header[ic])); + as->set_value(anchor, key, vp); + keylist.push_back(key); + } + + // And finally, place a list of all the keys in a well-known + // location. + Handle klp = as->add_node(PREDICATE_NODE, "*-column-keys-*"); + ValuePtr kvp = createLinkValue(keylist); + as->set_value(anchor, klp, kvp); + + return in; +} + +// ================================================================== + +/** + * Perform 2 passes: + * + * 1) Infer + * 1.1) its type + * 1.2) whether it has a header + * 1.3) whether it is dense or sparse + * + * 2) Load the actual data. + */ +std::istream& +opencog::istreamTable(const AtomSpacePtr& as, + const Handle& anchor, + std::istream& in, + const std::vector& ignore_features) +{ + std::streampos beg = in.tellg(); + + // Infer the properties of the table without loading its content + std::vector ignore_indexes; + std::vector col_types; + std::vector header; + bool has_header = false; + inferTableAttributes(in, ignore_features, ignore_indexes, + col_types, header, has_header); + + // If the header is missing, then fake it. 
+ if (not has_header) + { + header.clear(); + for (size_t i=0; i + * Linas Vepstas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _ATOMESE_TABLE_READ_H +#define _ATOMESE_TABLE_READ_H + +#include +#include +#include + +#include + +namespace opencog { + +// TODO: Should this be a StringValue? +typedef std::vector string_seq; + +/** + * Load columns from a CSV file and place them into Atomese Values on + * the indicated Atom. Atomese Values are vectors (of floats, bools, + * strings, or more complex structures). Each Value holds one column + * from the dataset. + * + * The features (columns) specified in ignore_features will be omitted + * from the representation. + * + * For example, a CSV dataset like this: + * o, i1, i2, i3, i4 + * 1, 0, 0, 3.3, "foo" + * 0, 1, 0, 4.4, "bar" + * + * will be loaded as key-value pairs on the `anchor` Atom. + * + * First, at the "well known location" + * (Predicate "*-column-keys-*") + * there will be a list of all of the column-keys in the table: + * (LinkValue + * (Predicate "o") + * (Predicate "i1") + * (Predicate "i2") + * (Predicate "i3") + * (Predicate "i4")) + * + * Next, under each key, there will a column of values: + * (Predicate "o") (BoolValue 1 0) + * (Predicate "i1") (BoolValue 0 1) + * (Predicate "i2") (BoolValue 0 0) + * (Predicate "i3") (FloatValue 3.3 4.4) + * (Predicate "i4") (StringValue "foo" "bar") + * + * @param file_name + * @param ignore_features + */ +void load_csv_table(const AtomSpacePtr&, + const Handle& anchor, + const std::string& file_name, + const string_seq& ignore_features=string_seq()); + +//std::istream& istreamRawITable( +// std::istream& in, ITable& table, +// const std::vector& ignored_indices=std::vector()); + +// Same as above, but works for an already-open stream. +std::istream& istreamTable(const AtomSpacePtr&, + const Handle&, + std::istream&, + const string_seq& ignore_features); + +} // ~namespaces opencog + +#endif // _ATOMESE_TABLE_READ_H diff --git a/opencog/scm/CMakeLists.txt b/opencog/scm/CMakeLists.txt index 868a9bcbbf..cb86d04524 100644 --- a/opencog/scm/CMakeLists.txt +++ b/opencog/scm/CMakeLists.txt @@ -18,6 +18,11 @@ ADD_GUILE_MODULE (FILES # Each of the files below are distinct modules. They need to be # compiled seperately. 
+ADD_GUILE_MODULE (FILES + opencog/csv-table.scm + COMPILE +) + ADD_GUILE_MODULE (FILES opencog/exec.scm DEPENDS exec diff --git a/opencog/scm/opencog/csv-table.scm b/opencog/scm/opencog/csv-table.scm new file mode 100644 index 0000000000..acbbea0afc --- /dev/null +++ b/opencog/scm/opencog/csv-table.scm @@ -0,0 +1,23 @@ +; +; OpenCog CSV Table Reader module +; + +(define-module (opencog csv-table)) + +(use-modules (opencog)) +(use-modules (opencog as-config)) +(load-extension + (string-append opencog-ext-path-csv-table "libcsv-table") + "opencog_csv_table_init") + +(export load-table) + +(set-procedure-property! load-table 'documentation +" + load-table ATOM FILE -- Load CSV/TSV table from FILE. + + Throws error if FILE does not exist. + More documentation TBD +") + +; -------------------------------------------------------------------- diff --git a/tests/persist/CMakeLists.txt b/tests/persist/CMakeLists.txt index f97cfcbe3f..eff8ffb3fb 100644 --- a/tests/persist/CMakeLists.txt +++ b/tests/persist/CMakeLists.txt @@ -1,3 +1,4 @@ +ADD_SUBDIRECTORY (csv) ADD_SUBDIRECTORY (sexpr) ADD_SUBDIRECTORY (sql) ADD_SUBDIRECTORY (tlb) diff --git a/tests/persist/csv/CMakeLists.txt b/tests/persist/csv/CMakeLists.txt new file mode 100644 index 0000000000..9c111c7d1d --- /dev/null +++ b/tests/persist/csv/CMakeLists.txt @@ -0,0 +1,3 @@ +LINK_LIBRARIES(csv atomspace) + +ADD_CXXTEST(CSVLoadUTest) diff --git a/tests/persist/csv/CSVLoadUTest.cxxtest b/tests/persist/csv/CSVLoadUTest.cxxtest new file mode 100644 index 0000000000..05e5a85322 --- /dev/null +++ b/tests/persist/csv/CSVLoadUTest.cxxtest @@ -0,0 +1,94 @@ +/* + * CSVLoadUTest.cxxtest + * + * Copyright (c) 2022 Linas Vepstas + * SPDX-License-Identifier: AGPL-3.0-or-later + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License v3 as + * published by the Free Software Foundation and including the exceptions + * at http://opencog.org/wiki/Licenses + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program; if not, write to: + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + + +#include +#include +#include + +using namespace opencog; + +class CSVLoadUTest : public CxxTest::TestSuite { + +private: + AtomSpacePtr _asp; + +public: + CSVLoadUTest() { + logger().set_print_to_stdout_flag(true); + _asp = createAtomSpace(); + } + + void setUp() { _asp->clear(); } + + void tearDown() {} + + void test_simple_load(); +}; + +// Test load_csv_table +void CSVLoadUTest::test_simple_load() +{ + logger().info("BEGIN TEST: %s", __FUNCTION__); + + Handle h = _asp->add_node(CONCEPT_NODE, "foo"); + + // Argh. Ugly. Fix. + load_csv_table(_asp, h, PROJECT_SOURCE_DIR "/tests/persist/csv/simple.csv"); + + // There's the five columns, plus the table of contents. 
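+	// That is: one key for each of the five columns in simple.csv,
+	// plus the (Predicate "*-column-keys-*") key itself.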
+ HandleSet keys = h->getKeys(); + TS_ASSERT_EQUALS(6, keys.size()); + + Handle colkey = _asp->add_node(PREDICATE_NODE, "*-column-keys-*"); + ValuePtr kvp = h->getValue(colkey); + TS_ASSERT_EQUALS(5, kvp->size()); + + // Loop over the columns + LinkValuePtr lvp = LinkValueCast(kvp); + HandleSeq keylist = lvp->to_handle_seq(); + for (const Handle& key : keylist) + { + ValuePtr vp = h->getValue(key); + TS_ASSERT_EQUALS(2, vp->size()); + printf("Column %s is %s\n", key->to_short_string().c_str(), + vp->to_string().c_str()); + } + + // Loop over columns again, verify types. + int bc = 0; + int fc = 0; + int sc = 0; + for (const Handle& key : keylist) + { + ValuePtr vp = h->getValue(key); + Type vt = vp->get_type(); + if (BOOL_VALUE == vt) bc++; + if (FLOAT_VALUE == vt) fc++; + if (STRING_VALUE == vt) sc++; + } + TS_ASSERT_EQUALS(3, bc); + TS_ASSERT_EQUALS(1, fc); + TS_ASSERT_EQUALS(1, sc); + + logger().info("END TEST: %s", __FUNCTION__); +} diff --git a/tests/persist/csv/simple.csv b/tests/persist/csv/simple.csv new file mode 100644 index 0000000000..da3ab5c488 --- /dev/null +++ b/tests/persist/csv/simple.csv @@ -0,0 +1,10 @@ +# +# This is a simple demo CSV file +# It contains some comments, a column header +# and some data. +# +o, i1, i2, i3, i4 + +# Above was the column headers. Now the data. + 1, 0, 0, 3.3, "foo" + 0, 1, 0, 4.4, "bar"