Move reading.read.read_csv and reading.read.read_sqlite to their own files
digitraceslab · Jul 9, 2024 · 125171e · 125171e
1 parent 3d9191a
commit 125171e
Show file tree

Hide file tree

Showing 10 changed files with 220 additions and 221 deletions.
diff --git a/niimpy/__init__.py b/niimpy/__init__.py
@@ -2,8 +2,8 @@
 
 from niimpy.reading.database import open, Data1, ALL
 from niimpy.preprocessing.filter import filter_dataframe
-from niimpy.reading.read import read_sqlite, read_sqlite_tables
-from niimpy.reading.read import read_csv, read_csv_string
+from niimpy.reading.sqlite import read_sqlite, read_sqlite_tables
+from niimpy.reading.csv import read_csv, read_csv_string
 from niimpy.preprocessing import sampledata
 from niimpy.preprocessing import util
 

diff --git a/niimpy/preprocessing/util.py b/niimpy/preprocessing/util.py
@@ -89,7 +89,42 @@ def unlink_if_exists(x):
     unlink_if_exists(SQLITE3_EXTENSIONS_FILENAME)
 
 
-#TODO: reanme to data.py
+def read_preprocess(df, add_group=None):
+    """Standard preprocessing arguments when reading.
+
+    This is a preprocessing filter which handles some standard arguments
+    when reading files.  This should be considered a private, unstable
+    function.
+
+
+    Parameters
+    ----------
+
+    df: pandas.DataFrame
+
+        Input data frame
+
+    add_group: string, optional
+
+        If given, add a new 'group' column with all values set to this
+        given identifier.
+
+
+    Returns
+    -------
+
+    df: dataframe
+
+        Resulting dataframe (modified in-place if possible, but may also
+        be a copy)
+
+    """
+    if add_group is not None:
+        df['group'] = add_group
+        #df['group'] = df['group'].astype('category')
+        #pd.Categorical(add_group)
+    return df
+
 
 def df_normalize(df, tz=None, old_tz=None):
     """Normalize a df (from sql) before presenting it to the user.

diff --git a/niimpy/reading/__init__.py b/niimpy/reading/__init__.py
@@ -1,3 +1,4 @@
 from . import mhealth
-from . import read
+from . import csv
 from . import google_takeout
+from . import sqlite
diff --git a/niimpy/reading/csv.py b/niimpy/reading/csv.py
@@ -0,0 +1,78 @@
+"""Read data from a CSV file
+
+"""
+
+import pandas as pd
+import warnings
+
+from niimpy.preprocessing import util
+
+
+def read_csv(filename, read_csv_options={}, add_group=None,
+             tz=None):
+    """Read DataFrame from csv file
+
+    This will read data from a csv file and then process the result with
+    `niimpy.preprocessing.util.df_normalize`.
+
+
+    Parameters
+    ----------
+
+    filename : str
+        filename of csv file
+
+    read_csv_options: dict
+        Dictionary of keyword options passed on to pandas.read_csv, e.g. for
+        custom csv files.
+
+    add_group : object
+        If given, add a 'group' column with all values set to this.
+
+    """
+    if tz is None:
+        warnings.warn(DeprecationWarning("From now on, you should explicitely specify timezone with e.g. tz='Europe/Helsinki'"), stacklevel=2)
+
+    df = pd.read_csv(filename, **read_csv_options)
+
+    # df_normalize sets the index to time values and does other time
+    # conversions, modifying df in place.
+    util.df_normalize(df, tz=tz)
+    df = util.read_preprocess(df, add_group=add_group)
+    return df
+
+
+def read_csv_string(string, tz=None):
+    """Parse a string containing CSV and return dataframe
+
+    This should not be used for serious reading of CSV from disk, but
+    can be useful for tests and examples.  Various CSV reading options
+    are turned on in order to be better for examples:
+
+    - Allow comments in the CSV file
+
+    - Remove the `datetime` column (redundant with `index` but some
+      older functions break without it, so default readers need to leave
+      it).
+
+    Parameters
+    ----------
+    string : string containing CSV file
+
+
+    Returns
+    -------
+    df: pandas.DataFrame
+    """
+    if tz is None:
+        warnings.warn(DeprecationWarning("From now on, you should explicitely specify timezone with e.g. tz='Europe/Helsinki'"), stacklevel=2)
+    import io
+    df = read_csv(io.StringIO(string),
+                  tz=tz,
+                  read_csv_options={
+                      'comment': '#',
+                      },
+                 )
+    if 'datetime' in df.columns:
+        del df['datetime']
+    return df
diff --git a/niimpy/reading/read.py b/niimpy/reading/read.py