MonetDBSolutions · gijzelaerr · Dec 29, 2021 · Dec 21, 2021 · Dec 22, 2021 · Dec 22, 2021
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1 +1,2 @@
-include monetdbe/_cffi/embed.h.j2
+include monetdbe/_cffi/embed.h.j2
+include monetdbe/_cffi/native_utilities.c
diff --git a/Makefile b/Makefile
@@ -12,16 +12,16 @@ venv/:
 	python3 -m venv venv
 	venv/bin/pip install --upgrade pip wheel build setuptools
 
-venv/installed: venv/
+venv/bin/pytest: venv/
 	venv/bin/pip install -e ".[test]"
-	touch venv/installed
+	touch venv/bin/pytest
 
-setup: venv/installed
+setup: venv/bin/pytest
 
-build: venv/installed
+build: venv/bin/pytest
 	venv/bin/pyproject-build
 
-test: setup
+test: venv/bin/pytest
 	venv/bin/pytest
 
 docker-wheels:
@@ -91,6 +91,12 @@ twine: venv/bin/twine
 info: setup
 	venv/bin/python -c "from monetdbe._cffi.util import print_info; print_info()"
 
+venv/bin/ipython: venv/
+	venv/bin/pip install ipython
+
+ipython: venv/bin/ipython
+	venv/bin/ipython
+
 venv38/:
 	/opt/homebrew/Cellar/[email protected]/3.8.*/bin/python3.8 -m venv venv38
 

diff --git a/monetdbe/_cffi/builder.py b/monetdbe/_cffi/builder.py
@@ -25,15 +25,12 @@
 default = monetdb_branch.lower() in ("default", "jul2021", "jan2022")
 win32 = platform == 'win32'
 
-
-source = """
-#include "monetdb/monetdbe.h"
-"""
-
+with open(Path(__file__).resolve().parent / "native_utilities.c") as f:
+    source = f.read()
 
 # the ffibuilder object needs to exist and be configured in the module namespace so setup.py can reach it
 ffibuilder = FFI()
-ffibuilder.set_source("monetdbe._lowlevel", source, libraries=['monetdbe'])
+ffibuilder.set_source("monetdbe._lowlevel", source=source, libraries=['monetdbe'])
 embed_path = str(Path(__file__).resolve().parent / 'embed.h.j2')
 
 

diff --git a/monetdbe/_cffi/convert/__init__.py b/monetdbe/_cffi/convert/__init__.py
@@ -1,6 +1,6 @@
 from decimal import Decimal
-from typing import List, Optional, Callable, Union, Any, Mapping
-from typing import NamedTuple
+from typing import List, Optional, Callable, Union, Any, Mapping, NamedTuple
+import logging
 
 import numpy as np
 
@@ -9,6 +9,11 @@
 from monetdbe.converters import converters
 from monetdbe.exceptions import ProgrammingError
 from monetdbe.pythonize import py_date, py_time, py_timestamp
+from monetdbe._cffi.branch import newer_then_jul2021
+
+from monetdbe.types import supported_numpy_types
+
+_logger = logging.getLogger()
 
 
 def make_string(blob: char_p) -> str:
@@ -46,34 +51,63 @@ class MonetdbTypeInfo(NamedTuple):
     null_value: Optional[Union[int, float]]
 
 
-#  monetdb C type, SQL type, numpy type, Cstringtype, pyconverter, null value, comment
-type_infos: List[MonetdbTypeInfo] = [
+inversable_type_infos: List[MonetdbTypeInfo] = [
     MonetdbTypeInfo(lib.monetdbe_bool, "boolean", np.dtype(np.bool_), "bool", None, None),
     MonetdbTypeInfo(lib.monetdbe_int8_t, "tinyint", np.dtype(np.int8), "int8_t", None, np.iinfo(np.int8).min),  # type: ignore
     MonetdbTypeInfo(lib.monetdbe_int16_t, "smallint", np.dtype(np.int16), "int16_t", None, np.iinfo(np.int16).min),  # type: ignore
     MonetdbTypeInfo(lib.monetdbe_int32_t, "int", np.dtype(np.int32), "int32_t", None, np.iinfo(np.int32).min),  # type: ignore
     MonetdbTypeInfo(lib.monetdbe_int64_t, "bigint", np.dtype(np.int64), "int64_t", None, np.iinfo(np.int64).min),  # type: ignore
-    MonetdbTypeInfo(lib.monetdbe_size_t, None, np.dtype(np.uint64), "size_t", None, None),  # used by monetdb internally
     MonetdbTypeInfo(lib.monetdbe_float, "real", np.dtype(np.float32), "float", py_float, np.finfo(np.float32).min),
     MonetdbTypeInfo(lib.monetdbe_double, "float", np.dtype(np.float64), "double", py_float, np.finfo(np.float64).min),
+]
+
+# things that can have a mapping from numpy to monetdb but not back
+numpy_to_monetdb_type_infos: List[MonetdbTypeInfo] = [
+    MonetdbTypeInfo(lib.monetdbe_int8_t, "tinyint", np.dtype(np.uint8), "int8_t", None, None),
+    MonetdbTypeInfo(lib.monetdbe_int16_t, "smallint", np.dtype(np.uint16), "int16_t", None, None),
+    MonetdbTypeInfo(lib.monetdbe_int32_t, "int", np.dtype(np.uint32), "int32_t", None, None),
+    MonetdbTypeInfo(lib.monetdbe_int64_t, "bigint", np.dtype(np.uint64), "int64_t", None, None),
+]
+
+# things that can have a mapping from monetdb to numpy but not back
+monetdb_to_numpy_type_infos: List[MonetdbTypeInfo] = [
     MonetdbTypeInfo(lib.monetdbe_str, "string", np.dtype('=O'), "str", make_string, None),
     MonetdbTypeInfo(lib.monetdbe_blob, "blob", np.dtype('=O'), "blob", make_blob, None),
     MonetdbTypeInfo(lib.monetdbe_date, "date", np.dtype('=O'), "date", py_date, None),
     MonetdbTypeInfo(lib.monetdbe_time, "time", np.dtype('=O'), "time", py_time, None),
     MonetdbTypeInfo(lib.monetdbe_timestamp, "timestamp", np.dtype('=O'), "timestamp", py_timestamp, None),
 ]
 
-numpy_type_map: Mapping[np.dtype, MonetdbTypeInfo] = {i.numpy_type: i for i in type_infos}
-monet_c_type_map: Mapping[int, MonetdbTypeInfo] = {i.c_type: i for i in type_infos}
+numpy_type_map: Mapping[np.dtype, MonetdbTypeInfo] = {i.numpy_type: i for i in
+                                                      inversable_type_infos + numpy_to_monetdb_type_infos}
+monet_c_type_map: Mapping[int, MonetdbTypeInfo] = {i.c_type: i for i in
+                                                   inversable_type_infos + monetdb_to_numpy_type_infos}
+
+
+def precision_warning(from_: int, to: int):
+    if from_ == lib.monetdbe_int64_t and to in (lib.monetdbe_int32_t, lib.monetdbe_int16_t, lib.monetdbe_int8_t):
+        _logger.warning("appending 64-bit data to lower bit column, potential loss of precision")
+    elif from_ == lib.monetdbe_int32_t and to in (lib.monetdbe_int16_t, lib.monetdbe_int8_t):
+        _logger.warning("appending 32-bit data to lower bit column, potential loss of precision")
+    elif from_ == lib.monetdbe_int16_t and to == lib.monetdbe_int8_t:
+        _logger.warning("appending 16-bit data to 8-bit column, potential loss of precision")
+    elif from_ in (lib.monetdbe_float, lib.monetdbe_double) and \
+            to in (lib.monetdbe_int64_t, lib.monetdbe_int32_t, lib.monetdbe_int16_t, lib.monetdbe_int8_t):
+        _logger.warning("appending float values to int column")
 
 
 def numpy_monetdb_map(numpy_type: np.dtype):
-    if numpy_type.kind in ('i', 'f'):  # type: ignore
+    if numpy_type.kind == 'U':
+        # this is an odd one, the numpy type string includes the width. Also, we don't format
+        # monetdb string columns as fixed width numpy columns yet, so technically this type is
+        # non-reversable for now.
+        return MonetdbTypeInfo(lib.monetdbe_str, "string", numpy_type, "char *", None, None)
+
+    if numpy_type.kind in supported_numpy_types:  # type: ignore
         return numpy_type_map[numpy_type]
-    raise ProgrammingError("append() only support int and float family types")
+    raise ProgrammingError(f"append() called with unsupported type {numpy_type}")
 
 
-from monetdbe._cffi.branch import newer_then_jul2021
 if newer_then_jul2021:
     def extract(rcol: monetdbe_column, r: int, text_factory: Optional[Callable[[str], Any]] = None):
         """

diff --git a/monetdbe/_cffi/embed.h.j2 b/monetdbe/_cffi/embed.h.j2
@@ -150,3 +150,5 @@ extern char* monetdbe_get_columns(monetdbe_database dbhdl, const char* schema_na
 
 extern char* monetdbe_dump_database(monetdbe_database dbhdl, const char *backupfile);
 extern char* monetdbe_dump_table(monetdbe_database dbhdl, const char *schema_name, const char *table_name, const char *backupfile);
+
+extern void initialize_string_array_from_numpy(char** restrict output, size_t size, char* restrict numpy_string_input, size_t stride_length);
diff --git a/monetdbe/_cffi/internal.py b/monetdbe/_cffi/internal.py
@@ -9,10 +9,8 @@
 
 from monetdbe._lowlevel import ffi, lib
 from monetdbe import exceptions
-from monetdbe._cffi.convert import make_string, monet_c_type_map, extract, numpy_monetdb_map
-from monetdbe._cffi.convert.bind import prepare_bind
-from monetdbe._cffi.convert.bind import monetdbe_decimal_to_bte, monetdbe_decimal_to_sht, monetdbe_decimal_to_int, monetdbe_decimal_to_lng
-
+from monetdbe._cffi.convert import make_string, monet_c_type_map, extract, numpy_monetdb_map, precision_warning
+from monetdbe._cffi.convert.bind import monetdbe_decimal_to_bte, monetdbe_decimal_to_sht, monetdbe_decimal_to_int, monetdbe_decimal_to_lng, prepare_bind
 from monetdbe._cffi.errors import check_error
 from monetdbe._cffi.types_ import monetdbe_result, monetdbe_database, monetdbe_column, monetdbe_statement
 
@@ -74,15 +72,15 @@ def get_autocommit() -> bool:
 def bind(statement: monetdbe_statement, data: Any, parameter_nr: int, type_info=None) -> None:
     try:
         _type_info = type_info[parameter_nr]
-        if (_type_info.sql_type == 'decimal'):
+        if _type_info.sql_type == 'decimal':
             d = int(Decimal(data) * (Decimal(10) ** _type_info.scale))
-            if (_type_info.impl_type == 'bte'):
+            if _type_info.impl_type == 'bte':
                 prepared = monetdbe_decimal_to_bte(d)
-            elif (_type_info.impl_type == 'sht'):
+            elif _type_info.impl_type == 'sht':
                 prepared = monetdbe_decimal_to_sht(d)
-            elif (_type_info.impl_type == 'int'):
+            elif _type_info.impl_type == 'int':
                 prepared = monetdbe_decimal_to_int(d)
-            elif (_type_info.impl_type == 'lng'):
+            elif _type_info.impl_type == 'lng':
                 prepared = monetdbe_decimal_to_lng(d)
             else:
                 raise NotImplementedError("Unknown decimal implementation type")
@@ -250,6 +248,7 @@ def append(self, table: str, data: Mapping[str, np.ndarray], schema: str = 'sys'
         """
         Directly append an array structure
         """
+
         self._switch()
         n_columns = len(data)
         existing_columns = list(self.get_columns(schema=schema, table=table))
@@ -260,24 +259,47 @@ def append(self, table: str, data: Mapping[str, np.ndarray], schema: str = 'sys'
             raise exceptions.ProgrammingError(error)
 
         work_columns = ffi.new(f'monetdbe_column * [{n_columns}]')
-        work_objs = []
+        # cffi_objects assists to keep all in-memory native data structure alive during the execution of this call
+        cffi_objects = []
         for column_num, (column_name, existing_type) in enumerate(existing_columns):
             column_values = data[column_name]
             work_column = ffi.new('monetdbe_column *')
             type_info = numpy_monetdb_map(column_values.dtype)
-            if not type_info.c_type == existing_type:
-                existing_type_string = monet_c_type_map[existing_type].c_string_type
-                error = f"Type '{type_info.c_string_type}' for appended column '{column_name}' " \
-                        f"does not match table type '{existing_type_string}'"
-                raise exceptions.ProgrammingError(error)
+
+            # try to convert the values if types don't match
+            if type_info.c_type != existing_type:
+                precision_warning(type_info.c_type, existing_type)
+                to_numpy_type = monet_c_type_map[existing_type].numpy_type
+                try:
+                    column_values = column_values.astype(to_numpy_type)
+                    type_info = numpy_monetdb_map(column_values.dtype)
+                except Exception as e:
+                    existing_type_string = monet_c_type_map[existing_type].c_string_type
+                    error = f"Can't convert '{type_info.c_string_type}' " \
+                            f"to type '{existing_type_string}' for column '{column_name}': {e} "
+                    raise ValueError(error)
+
             work_column.type = type_info.c_type
             work_column.count = column_values.shape[0]
             work_column.name = ffi.new('char[]', column_name.encode())
-            work_column.data = ffi.cast(f"{type_info.c_string_type} *", ffi.from_buffer(column_values))
+            if type_info.numpy_type.kind == 'U':
+                # first massage the numpy array of unicode into a matrix of null terminated rows of bytes.
+                v = np.char.encode(column_values).view('b').reshape((work_column.count, -1))
+                v = np.c_[v, np.zeros(work_column.count, dtype=np.int8)]
+                stride_length = v.shape[1]
+                cffi_objects.append(v)
+                t = ffi.new('char*[]', work_column.count)
+                cffi_objects.append(t)
+                p = ffi.from_buffer("char*", v)
+                cffi_objects.append(p)
+                lib.initialize_string_array_from_numpy(t, work_column.count, p, stride_length)
+                work_column.data = t
+            else:
+                work_column.data = ffi.from_buffer(f"{type_info.c_string_type}*", column_values)
             work_columns[column_num] = work_column
-            work_objs.append(work_column)
-        check_error(
-            lib.monetdbe_append(self._monetdbe_database, schema.encode(), table.encode(), work_columns, n_columns))
+            cffi_objects.append(work_column)
+        check_error(lib.monetdbe_append(self._monetdbe_database, schema.encode(),
+                                        table.encode(), work_columns, n_columns))
 
     def prepare(self, query: str) -> monetdbe_statement:
         self._switch()

diff --git a/monetdbe/_cffi/native_utilities.c b/monetdbe/_cffi/native_utilities.c
@@ -0,0 +1,8 @@
+#include "monetdb/monetdbe.h"
+
+#include <stddef.h>
+void initialize_string_array_from_numpy(char** restrict output, size_t size, char* restrict numpy_string_input, size_t stride_length) {
+    for (size_t i = 0; i < size; i++) {
+        output[i] = numpy_string_input + i*stride_length;
+    }
+}
diff --git a/monetdbe/cursors.py b/monetdbe/cursors.py
@@ -6,7 +6,8 @@
 from monetdbe.connection import Connection, Description
 from monetdbe.exceptions import ProgrammingError, InterfaceError
 from monetdbe.formatting import format_query, strip_split_and_clean, parameters_type
-from monetdbe.monetize import monet_identifier_escape, convert
+from monetdbe.monetize import monet_identifier_escape
+from monetdbe.types import supported_numpy_types
 
 if TYPE_CHECKING:
     from monetdbe.row import Row
@@ -285,7 +286,7 @@ def _insert_slow(self, table: str, data: Dict[str, np.ndarray], schema: str = 's
         query = f"insert into {schema}.{table} ({columns}) values ({qmarks})"
         return self.executemany(query, rows_zipped)
 
-    def insert(self, table: str, values: Union[pd.DataFrame, Dict[str, np.ndarray]], schema: str = 'sys'):
+    def insert(self, table: str, values: Union[pd.DataFrame, Mapping[str, np.ndarray]], schema: str = 'sys'):
         """
         Inserts a set of values into the specified table.
 
@@ -303,7 +304,7 @@ def insert(self, table: str, values: Union[pd.DataFrame, Dict[str, np.ndarray]],
             if not isinstance(value, (np.ma.core.MaskedArray, np.ndarray)):  # type: ignore
                 prepared[key] = np.array(value)
 
-        if sum(i.dtype.kind not in 'if' for i in prepared.values()):  # type: ignore
+        if sum(i.dtype.kind not in supported_numpy_types for i in prepared.values()):  # type: ignore
             warn(
                 "One of the columns you are inserting is not of type int or float which fast append doesn't support. Falling back to regular insert.")
             return self._insert_slow(table, prepared, schema)

diff --git a/monetdbe/monetize.py b/monetdbe/monetize.py
@@ -77,6 +77,7 @@ def monet_datetime(data: Any) -> str:
     (datetime.date, monet_escape),
     (datetime.timedelta, monet_escape),
     (bool, monet_bool),
+    (np.bool_, monet_bool),
     (type(None), monet_none),
     (np.int64, int),
     (np.int32, int),

diff --git a/monetdbe/types.py b/monetdbe/types.py
@@ -50,3 +50,17 @@
 BINARY_LARGE_OBJECT = BLOB
 NUMERIC = DECIMAL
 DOUBLE_PRECISION = DOUBLE
+
+supported_numpy_types: str = (
+    'b'  # boolean
+    'i'  # signed integer
+    'u'  # unsigned integer
+    'f'  # floating-point
+    # 'M'  # datetime
+    'U'  # Unicode
+    # c complex floating-point
+    # m timedelta
+    # O object
+    # S (byte-)string
+    # V void
+)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,3 +1,6 @@
 [build-system]
 requires = ["setuptools >= 40.6.0", "wheel", 'cffi']
 build-backend = "setuptools.build_meta"
+
+[mypy]
+plugins = "numpy.typing.mypy_plugin"
diff --git a/tests/test_cffi.py b/tests/test_cffi.py
@@ -29,34 +29,50 @@ def test_append_too_little_columns(self):
                 con._internal.append(table='test', data=data)
 
     def test_append_wrong_type(self):
+        """
+        we now convert this automatically, so should not raise error
+        """
         with connect() as con:
             con.execute("CREATE TABLE test (i int)")
             data = {'i': np.array([0.1, 0.2, 0.3], dtype=np.float32)}
-            with self.assertRaises(ProgrammingError):
-                con._internal.append(table='test', data=data)
+            con._internal.append(table='test', data=data)
 
     def test_append_wrong_size(self):
+        """
+        we now convert this automatically, so should not raise error
+        """
         with connect() as con:
+
             con.execute("CREATE TABLE test (i int)")  # SQL int is 32 bit
             data = {'i': np.array([1, 2, 3], dtype=np.int64)}
-            with self.assertRaises(ProgrammingError):
-                con._internal.append(table='test', data=data)
+            con._internal.append(table='test', data=data)
 
     def test_append_supported_types(self):
         with connect() as con:
-            con.execute("CREATE TABLE test (t tinyint, s smallint, i int, b bigint, r real, f float)")
+            con.execute("CREATE TABLE test (t tinyint, s smallint, i int, h bigint, r real, f float, b bool)")
             con.execute(
                 """
-                INSERT INTO test VALUES (2^8,  2^16,  2^32,  2^64,  0.12345,  0.123456789),
-                                        (NULL, NULL,  NULL,  NULL,  NULL,     NULL),
-                                        (0,    0,     0,     0,     0.0,      0.0),
-                                        (-2^8, -2^16, -2^32, -2^64, -0.12345, -0.123456789)
+                INSERT INTO test VALUES (2^8,  2^16,  2^32,  2^64,  0.12345,  0.123456789, true),
+                                        (NULL, NULL,  NULL,  NULL,  NULL,     NULL, NULL),
+                                        (0,    0,     0,     0,     0.0,      0.0, false),
+                                        (-2^8, -2^16, -2^32, -2^64, -0.12345, -0.123456789, false)
                 """
             )
             data = con.execute("select * from test").fetchnumpy()
             con._internal.append(schema='sys', table='test', data=data)
             con.cursor().insert(table='test', values=data)
 
+    def test_append_numpy_only_types(self):
+        """
+        test numpy types don't have have a direct 1-on-1 sql mapping
+        """
+        with connect() as con:
+            table = 'i'
+            con.execute(f"CREATE TABLE {table} (i int)")
+            data = {'i': np.ndarray([0], dtype=np.uint32)}
+            con._internal.append(schema='sys', table=table, data=data)
+            con.cursor().insert(table=table, values=data)
+
     def test_append_unsupported_types(self):
         with connect() as con:
             con.execute("CREATE TABLE test (s string, b blob, d date, t time, ts timestamp)")