diff --git a/niimpy/__init__.py b/niimpy/__init__.py index 95ce8cf6..5646f320 100644 --- a/niimpy/__init__.py +++ b/niimpy/__init__.py @@ -2,8 +2,8 @@ from niimpy.reading.database import open, Data1, ALL from niimpy.preprocessing.filter import filter_dataframe -from niimpy.reading.read import read_sqlite, read_sqlite_tables -from niimpy.reading.read import read_csv, read_csv_string +from niimpy.reading.sqlite import read_sqlite, read_sqlite_tables +from niimpy.reading.csv import read_csv, read_csv_string from niimpy.preprocessing import sampledata from niimpy.preprocessing import util diff --git a/niimpy/preprocessing/util.py b/niimpy/preprocessing/util.py index 0dd8497f..c5445ed5 100644 --- a/niimpy/preprocessing/util.py +++ b/niimpy/preprocessing/util.py @@ -89,7 +89,42 @@ def unlink_if_exists(x): unlink_if_exists(SQLITE3_EXTENSIONS_FILENAME) -#TODO: reanme to data.py +def read_preprocess(df, add_group=None): + """Standard preprocessing arguments when reading. + + This is a preprocessing filter which handles some standard arguments + when reading files. This should be considered a private, unstable + function. + + + Parameters + ---------- + + df: pandas.DataFrame + + Input data frame + + add_group: string, optional + + If given, add a new 'group' column with all values set to this + given identifier. + + + Returns + ------- + + df: dataframe + + Resulting dataframe (modified in-place if possible, but may also + be a copy) + + """ + if add_group is not None: + df['group'] = add_group + #df['group'] = df['group'].astype('category') + #pd.Categorical(add_group) + return df + def df_normalize(df, tz=None, old_tz=None): """Normalize a df (from sql) before presenting it to the user. diff --git a/niimpy/reading/__init__.py b/niimpy/reading/__init__.py index dab96776..6df96d24 100644 --- a/niimpy/reading/__init__.py +++ b/niimpy/reading/__init__.py @@ -1,3 +1,4 @@ from . import mhealth -from . import read +from . import csv from . import google_takeout +from . 
import sqlite diff --git a/niimpy/reading/csv.py b/niimpy/reading/csv.py new file mode 100644 index 00000000..0897535f --- /dev/null +++ b/niimpy/reading/csv.py @@ -0,0 +1,78 @@ +"""Read data from a CSV file + +""" + +import pandas as pd +import warnings + +from niimpy.preprocessing import util + + +def read_csv(filename, read_csv_options={}, add_group=None, + tz=None): + """Read DataFrame from csv file + + This will read data from a csv file and then process the result with + `niimpy.util.df_normalize`. + + + Parameters + ---------- + + filename : str + filename of csv file + + read_csv_options: dict + Dictionary of options to pandas.read_csv, if this is necessary for custom + csv files. + + add_group : object + If given, add a 'group' column with all values set to this. + + """ + if tz is None: + warnings.warn(DeprecationWarning("From now on, you should explicitely specify timezone with e.g. tz='Europe/Helsinki'"), stacklevel=2) + + df = pd.read_csv(filename, **read_csv_options) + + # df_normalize converts sets the index to time values and does other time + # conversions. Inplace. + util.df_normalize(df, tz=tz) + df = util.read_preprocess(df, add_group=add_group) + return df + + +def read_csv_string(string, tz=None): + """Parse a string containing CSV and return dataframe + + This should not be used for serious reading of CSV from disk, but + can be useful for tests and examples. Various CSV reading options + are turned on in order to be better for examples: + + - Allow comments in the CSV file + + - Remove the `datetime` column (redundant with `index` but some + older functions break without it, so default readers need to leave + it). + + Parameters + ---------- + string : string containing CSV file + + + Returns + ------- + df: pandas.DataFrame + """ + if tz is None: + warnings.warn(DeprecationWarning("From now on, you should explicitely specify timezone with e.g. 
tz='Europe/Helsinki'"), stacklevel=2) + import io + df = read_csv(io.StringIO(string), + tz=tz, + read_csv_options={ + 'comment': '#', + }, + ) + if 'datetime' in df.columns: + del df['datetime'] + return df diff --git a/niimpy/reading/read.py b/niimpy/reading/read.py deleted file mode 100644 index 3ad78209..00000000 --- a/niimpy/reading/read.py +++ /dev/null @@ -1,212 +0,0 @@ -"""Read data from various formats, user entery point. - -This module contains various functions `read_*` which load data from different -formats into pandas.DataFrame:s. As a side effect, it provides the -authoritative information on how incoming data is converted to dataframes. - -""" - -import pandas as pd -import warnings -import json - -from niimpy.reading import database -from niimpy.preprocessing import util - -def _read_preprocess(df, add_group=None): - """Standard preprocessing arguments when reading. - - This is a preprocessing filter which handles some standard arguments - when reading files. This should be considered a private, unstable - function. - - - Parameters - ---------- - - df: pandas.DataFrame - - Input data frame - - add_group: string, optional - - If given, add a new 'group' column with all values set to this - given identifier. - - - Returns - ------- - - df: dataframe - - Resulting dataframe (modified in-place if possible, but may also - be a copy) - - """ - if add_group is not None: - df['group'] = add_group - #df['group'] = df['group'].astype('category') - #pd.Categorical(add_group) - return df - - -def read_sqlite(filename, table, add_group=None, user=database.ALL, limit=None, offset=None, start=None, end=None, tz=None): - """Read DataFrame from sqlite3 database - - This will read data from a sqlite3 file, taking sensor data in a - given table, and optionally apply various limits. 
- - Parameters - ---------- - - filename : str - filename of sqlite3 database - - table : str - table name of data within the database - - add_group : object - If given, add a 'group' column with all values set to this. - - user : str or database.ALL, optional - If given, return only data matching this user (based an column 'user') - - limit : int, optional - If given, return only this many rows - - offset : int, optional - When used with limit, skip this many lines at the beginning - - start : int or float or str or datetime.datetime, optional - If given, limit to this starting time. Formats can be int/float - (unixtime), string (parsed with dateutil.parser.parser, or - datetime.datetime. - - end : int or float or str or datetime.datetime, optional - Same meaning as 'start', but for end time - """ - if tz is None: - warnings.warn(DeprecationWarning("From now on, you should explicitely specify timezone with e.g. tz='Europe/Helsinki'"), stacklevel=2) - - db = database.Data1(filename, tz=tz) - df = db.raw(table, user, limit=limit, offset=offset, start=start, end=end) - df = _read_preprocess(df, add_group=add_group) - return df - - -def read_sqlite_tables(filename): - """Return names of all tables in this database - - Return a set of all tables contained in this database. This may be - useful when you need to see what data is available within a database. - """ - db = database.Data1(filename) - return db.tables() - -def _get_dataframe(df_or_database, table, user=None): - """Read from database or directly use DataFrame - - Functions used to accept a database only, now the standard is - dataframe. This provides some backwards compatability between the - old and new systems: DataFrames are used as-is, but if a database is - given, it extracts the right information out of the table (and does - what the database used to do to filter by user). This function - could also be used to transparently accept other types of data - inputs. 
- - If input is: - - - atabase: extract the given table/user using .raw() and return - - A typical usage is:: - - def function(df): - # 'df' could be a DataFrame or database - df = _get_dataframe(df, 'TableName') - # 'df' is now always a DataFrame - - Returns - ------- - df : DataFrame (same one if possible) - - """ - if isinstance(df_or_database, database.Data1): - df = df_or_database.raw(table=table, user=subject) - else: - df = df_or_database - # questions was *not* dataframe. - if user is not None and user is not database.ALL: - df = df[df['user'] == user] - return df - - - - - -def read_csv(filename, read_csv_options={}, add_group=None, - tz=None): - """Read DataFrame from csv file - - This will read data from a csv file and then process the result with - `niimpy.util.df_normalize`. - - - Parameters - ---------- - - filename : str - filename of csv file - - read_csv_options: dict - Dictionary of options to pandas.read_csv, if this is necessary for custom - csv files. - - add_group : object - If given, add a 'group' column with all values set to this. - - """ - if tz is None: - warnings.warn(DeprecationWarning("From now on, you should explicitely specify timezone with e.g. tz='Europe/Helsinki'"), stacklevel=2) - - df = pd.read_csv(filename, **read_csv_options) - - # df_normalize converts sets the index to time values and does other time - # conversions. Inplace. - util.df_normalize(df, tz=tz) - df = _read_preprocess(df, add_group=add_group) - return df - -def read_csv_string(string, tz=None): - """Parse a string containing CSV and return dataframe - - This should not be used for serious reading of CSV from disk, but - can be useful for tests and examples. Various CSV reading options - are turned on in order to be better for examples: - - - Allow comments in the CSV file - - - Remove the `datetime` column (redundant with `index` but some - older functions break without it, so default readers need to leave - it). 
- - Parameters - ---------- - string : string containing CSV file - - - Returns - ------- - df: pandas.DataFrame - """ - if tz is None: - warnings.warn(DeprecationWarning("From now on, you should explicitely specify timezone with e.g. tz='Europe/Helsinki'"), stacklevel=2) - import io - df = read_csv(io.StringIO(string), - tz=tz, - read_csv_options={ - 'comment': '#', - }, - ) - if 'datetime' in df.columns: - del df['datetime'] - return df diff --git a/niimpy/reading/sqlite.py b/niimpy/reading/sqlite.py new file mode 100644 index 00000000..9892a4fc --- /dev/null +++ b/niimpy/reading/sqlite.py @@ -0,0 +1,97 @@ +""" Read data from sqlite3 database. +""" + +import warnings + +from niimpy.reading import database +from niimpy.preprocessing import util + + +def read_sqlite(filename, table, add_group=None, user=database.ALL, limit=None, offset=None, start=None, end=None, tz=None): + """Read DataFrame from sqlite3 database + + This will read data from a sqlite3 file, taking sensor data in a + given table, and optionally apply various limits. + + Parameters + ---------- + + filename : str + filename of sqlite3 database + + table : str + table name of data within the database + + add_group : object + If given, add a 'group' column with all values set to this. + + user : str or database.ALL, optional + If given, return only data matching this user (based an column 'user') + + limit : int, optional + If given, return only this many rows + + offset : int, optional + When used with limit, skip this many lines at the beginning + + start : int or float or str or datetime.datetime, optional + If given, limit to this starting time. Formats can be int/float + (unixtime), string (parsed with dateutil.parser.parser, or + datetime.datetime. + + end : int or float or str or datetime.datetime, optional + Same meaning as 'start', but for end time + """ + if tz is None: + warnings.warn(DeprecationWarning("From now on, you should explicitely specify timezone with e.g. 
tz='Europe/Helsinki'"), stacklevel=2) + + db = database.Data1(filename, tz=tz) + df = db.raw(table, user, limit=limit, offset=offset, start=start, end=end) + df = util.read_preprocess(df, add_group=add_group) + return df + + +def read_sqlite_tables(filename): + """Return names of all tables in this database + + Return a set of all tables contained in this database. This may be + useful when you need to see what data is available within a database. + """ + db = database.Data1(filename) + return db.tables() + +def _get_dataframe(df_or_database, table, user=None): + """Read from database or directly use DataFrame + + Functions used to accept a database only, now the standard is + dataframe. This provides some backwards compatability between the + old and new systems: DataFrames are used as-is, but if a database is + given, it extracts the right information out of the table (and does + what the database used to do to filter by user). This function + could also be used to transparently accept other types of data + inputs. + + If input is: + + - database: extract the given table/user using .raw() and return + + A typical usage is:: + + def function(df): + # 'df' could be a DataFrame or database + df = _get_dataframe(df, 'TableName') + # 'df' is now always a DataFrame + + Returns + ------- + df : DataFrame (same one if possible) + + """ + if isinstance(df_or_database, database.Data1): + df = df_or_database.raw(table=table, user=user) + else: + df = df_or_database + # questions was *not* dataframe. 
+ if user is not None and user is not database.ALL: + df = df[df['user'] == user] + return df diff --git a/tests/preprocessing/test_sampledata.py b/tests/preprocessing/test_sampledata.py index cb15f3e7..4d4b563b 100644 --- a/tests/preprocessing/test_sampledata.py +++ b/tests/preprocessing/test_sampledata.py @@ -6,7 +6,7 @@ from niimpy import config import niimpy -from niimpy.reading import read +from niimpy.reading import csv from niimpy.preprocessing import sampledata TZ = 'Europe/Helsinki' diff --git a/tests/reading/test_read.py b/tests/reading/test_read.py index ebf6bb48..07f33810 100644 --- a/tests/reading/test_read.py +++ b/tests/reading/test_read.py @@ -1,7 +1,7 @@ import pandas as pd import niimpy -from niimpy.reading import read +from niimpy.preprocessing import util from niimpy import config TZ = 'Europe/Helsinki' @@ -9,7 +9,7 @@ def test_read_preprocess_add_group(): """Test of add_group= option""" data = pd.DataFrame({'user': ['u1', 'u2', 'u3'], 'a': [1,2,3], 'b': [4,5,6]}) - data2 = read._read_preprocess(data, add_group='group1') + data2 = util.read_preprocess(data, add_group='group1') assert 'group' in data2 assert (data2['group'] == 'group1').all() diff --git a/tests/reading/test_read_csv.py b/tests/reading/test_read_csv.py index 8472c3ce..ee1b2d5e 100644 --- a/tests/reading/test_read_csv.py +++ b/tests/reading/test_read_csv.py @@ -2,7 +2,7 @@ import numpy as np import niimpy -from niimpy.reading import read +from niimpy.reading import csv from niimpy.preprocessing import sampledata from niimpy import config diff --git a/tests/reading/test_read_sqlite.py b/tests/reading/test_read_sqlite.py index e8ec4d7c..41dc08a5 100644 --- a/tests/reading/test_read_sqlite.py +++ b/tests/reading/test_read_sqlite.py @@ -1,5 +1,5 @@ import niimpy -from niimpy.reading import read +from niimpy.reading import csv from niimpy.preprocessing import sampledata TZ = 'Europe/Helsinki'