From 76171a41df8796638003e9cb6292170db5233ce2 Mon Sep 17 00:00:00 2001
From: Matt Hall <matt@agilescientific.com>
Date: Sat, 23 Apr 2022 11:55:28 -0400
Subject: [PATCH] Revert "IO changes (#134)" (#152)

This reverts commit 58ca0218efb40ff60d99858f9d5877ed5be7d2a9.
---
 striplog/striplog.py   | 211 +++++++++++++++--------------------------
 tests/test_plots.py    |   3 +-
 tests/test_striplog.py |   4 +-
 3 files changed, 78 insertions(+), 140 deletions(-)

diff --git a/striplog/striplog.py b/striplog/striplog.py
index 28a6930..de29f66 100644
--- a/striplog/striplog.py
+++ b/striplog/striplog.py
@@ -597,157 +597,98 @@ def _build_list_of_Intervals(cls,
         return list_of_Intervals
 
     @classmethod
-    def from_csv(cls, filename,
+    def from_csv(cls, filename=None,
                  text=None,
-                 order='depth',
-                 usecols=None, # which columns to read, with 0 being the first
-                 source=None, # string - where you got the data from
                  dlm=',',
-                 delimiter=',',
-                 skip_header=0,
-                 names=None, # works the same way as np.genfromtxt
-                 fieldnames=None,
-                 points=False,
                  lexicon=None,
+                 points=False,
                  include=None,
                  exclude=None,
-                 ignore=None,
-                 stop=None,
+                 remap=None,
                  function=None,
                  null=None,
-                 remap=None,
-                 **kwargs
-        ):
-        """
-        Read a csv file and generate a Striplog.
-        There are a number of cases that should be handled:
-            * Only tops are given - bases are inferred to be the next top.
-            * Only bases are given - tops are inferred to be the next base.
-            * Both bases and tops are given
-            * Either bases or tops are given along with a thickness -
-                the missing value is calculated using the thickness.
-
-            TODO: handle the missing cases when creating the list of intervals.
-                Currently only the first and third cases will work.
-
-        The easiest is to have the headers in the csv, and then use the 
-            `name` argument. If you do this, then the `top`, `base` and
-            `thickness` columns will be inferred if they are named that.
-        Alternatively, the columns to use can be supplied as a tuple of ints 
-            using `usecols`.
-
-        The actual CSV reading is done using numpy's genfromtxt, so all its
-            arguments can be passed as kwargs. For details, see
-            https://numpy.org/devdocs/reference/generated/numpy.genfromtxt.html
+                 ignore=None,
+                 source=None,
+                 stop=None,
+                 fieldnames=None):
+        """
+        Load from a CSV file or text.
 
-        Args:
-            filename: Filename, a string, a list of strings, a generator 
-                or an open file-like object with a read method, for example,
-                a file or io.StringIO object. If a single string is provided,
-                it is assumed to be the name of a local or remote file. If a
-                list of strings or a generator returning strings is provided,
-                each string is treated as one line in a file. When the URL of
-                a remote file is passed, the file is automatically downloaded
-                to the current directory and opened. 
-            text (str, optional): Deprecated.
-            order (str, optional): Controls the direction of the striplogs.
-                Accepted values are 'depth' or 'elevation'. This is most important
-                when using a thickness, defaults to 'depth'.
-            usecols (int, list of ints, optional): Which columns to read. This
-                should be linked with the use of `names` in many cases.
-            source (string, optional): The attribution or source of the file.
-            dlm (string, optional): Delimiter character between data entries in row of
-                file. Deprecated. Please use 'delimiter' instead. Defaults to ','.
-            delimiter (string, optional): The string used to separate values. 
-                By default, a single comma (,) acts as delimiter. An integer or
-                sequence of integers can also be provided as width(s) of each field. 
-            skip_header (int, optional): Number of rows to skip. Defaults to 0.
-            names (optional): If True, uses first unskipped row to define column
-                headers. Definitely the easiest and most reliable approach.
-                If a list, uses the list for column names. Defaults to None.
-            fieldnames (str, sequence of strings, optional): [description].
-                Deprecated. See `names`. Defaults to None.
-            points ([], optional) [description]. Defaults to None.
-            lexicon ([type], optional): [description]. Defaults to None.
-            include ([type], optional): [description]. Defaults to None.
-            exclude ([type], optional): [description]. Defaults to None.
-            ignore ([type], optional): [description]. Defaults to None.
-            stop ([type], optional): [description]. Defaults to None.
-            function (function, optional): Not currently used here, but will be.
-                Defaults to None.
-            null (string, optional): Deprecated. Defaults to None.
-            remap (string, optional): Deprecated. Defaults to None.
+        Args
+            filename (str): The filename, or use `text`.
+            text (str): CSV data as a string, or use `filename`.
+            dlm (str): The delimiter, default ','.
+            lexicon (Lexicon): The lexicon to use, optional. Only needed if \
+                parsing descriptions (e.g. cuttings).
+            points (bool): Whether to make a point dataset (as opposed to \
+                ordinary intervals with top and base. Default is False.
+            include: Default is None.
+            exclude: Default is None.
+            remap: Default is None.
+            function: Default is None.
+            null: Default is None.
+            ignore: Default is None.
+            source: Default is None.
+            stop: Default is None.
+            fieldnames: Default is None.
 
-        Returns:
-            Striplog: A Striplog, made of the intervals as defined in the CSV file.
-        """
-
-        # Some deprecation warnings to start with.
-        # This should only be here for a couple of releases.
-        removed_by = 'This arg will be removed in version 0.9.1'
-        if (dlm != ','):
-            delimiter = dlm
-            w = f"'dlm' is deprecated; please use 'delimiter'. {removed_by}"
-            warnings.warn(w, FutureWarning, stacklevel=2)
-
-        if (null != None):
-            w1 = "'null' is deprecated; please use 'fill_values' or 'missing_values'."
-            w2 = f"See numpy.genfromtxt for how these work."
-            warnings.warn(w1+w2, FutureWarning, stacklevel=2)
-
-        if (fieldnames != None):
-            names = fieldnames
-            w = f"'fieldnames' is deprecated; please use 'names'. {removed_by}"
-            warnings.warn(w, FutureWarning, stacklevel=2)
-
-        if (remap != None):
-            w = f"'remap' is deprecated and no longer being used. {removed_by}"
-            warnings.warn(w, FutureWarning, stacklevel=2)
-
-        if (text != None):
-            w = f"'text' is deprecated and no longer being used. {removed_by}"
-            warnings.warn(w, FutureWarning, stacklevel=2)
-
-        data = np.genfromtxt(filename, delimiter=delimiter, usecols=usecols,
-                             names=names, skip_header=skip_header,
-                             **kwargs)
-
-        data_dict = {}
-        if names:
-            print('checking names')
-            for name in data.dtype.names:
-                # We expect `top` and `base`, but we might get `tops` and `bases`.
-                if name == 'tops': # There might be other cases worth checking?
-                    data_dict.update({'top': data[name]})
-                if name == 'bases': # There might be other cases worth checking?
-                    data_dict.update({'base': data[name]})
-                else: # Everything else is handled here, which is quite nice.
-                    data_dict.update({name: data[name]})
-
-        if not names: # It would be nice to not do something like this.
-                      # by forcing users to pass names with something?
-            if top != None:
-                data_dict.update({'top': data[:, top]})
-            if base != None:
-                data_dict.update({'base': data[:, base]})
-            if thickness != None:
-                data_dict.update({'thickness': data[:, thickness]})
-            if content:
+        Returns
+            Striplog. A new instance.
+        """
+        if (filename is None) and (text is None):
+            raise StriplogError("You must provide a filename or CSV text.")
+
+        if (filename is not None):
+            if source is None:
+                source = filename
+            with open(filename, 'r') as f:
+                text = f.read()
+
+        source = source or 'CSV'
+
+        # Deal with multiple spaces in space delimited file.
+        if dlm == ' ':
+            text = re.sub(r'[ \t]+', ' ', text)
+
+        if fieldnames is not None:
+            text = dlm.join(fieldnames) + '\n' + text
+
+        try:
+            f = StringIO(text)  # Python 3
+        except TypeError:
+            f = StringIO(unicode(text))  # Python 2
+
+        reader = csv.DictReader(f, delimiter=dlm)
+
+        # Reorganize the data to make fixing it easier.
+        reorg = {k.strip().lower(): []
+                 for k in reader.fieldnames
+                 if k is not None}
+        t = f.tell()
+        for key in reorg:
+            f.seek(t)
+            for r in reader:
+                s = {k.strip().lower(): v.strip() for k, v in r.items()}
                 try:
-                    for ii in content:
-                        data_dict.update({f'content_{ii}': data[:, ii]})
-                except TypeError:
-                    data_dict.update({f'content_{content}': data[:, content]})
+                    reorg[key].append(float(s[key]))
+                except ValueError:
+                    reorg[key].append(s[key])
+
+        f.close()
+
+        remap = remap or {}
+        for k, v in remap.items():
+            reorg[v] = reorg.pop(k)
 
-        # Now we make the intervals.
-        list_of_Intervals = cls._build_list_of_Intervals(data_dict,
+        data = cls._clean_longitudinal_data(reorg, null=null)
+
+        list_of_Intervals = cls._build_list_of_Intervals(data,
                                                          points=points,
                                                          lexicon=lexicon,
                                                          include=include,
                                                          exclude=exclude,
                                                          ignore=ignore,
-                                                         stop=stop
-                                                         )
+                                                         stop=stop)
 
         return cls(list_of_Intervals, source=source)
 
diff --git a/tests/test_plots.py b/tests/test_plots.py
index 5c1712a..8969971 100644
--- a/tests/test_plots.py
+++ b/tests/test_plots.py
@@ -9,7 +9,6 @@
 https://pypi.python.org/pypi/pytest-mpl/0.3
 """
 import random
-import io
 
 import matplotlib.pyplot as plt
 import pytest
@@ -112,7 +111,7 @@ def test_striplog_top_plot():
                    20, Sobrarbe Fm.
                    50, Cretaceous"""
 
-    tops = Striplog.from_csv(io.StringIO(tops_csv), names=True)
+    tops = Striplog.from_csv(text=tops_csv)
 
     fig = tops.plot(style='tops',
                     field='formation',
diff --git a/tests/test_striplog.py b/tests/test_striplog.py
index 424ea7c..391abd7 100644
--- a/tests/test_striplog.py
+++ b/tests/test_striplog.py
@@ -3,7 +3,6 @@
 """
 import numpy as np
 import pytest
-import io
 
 from striplog import Component
 from striplog import Interval
@@ -252,8 +251,7 @@ def test_from_descriptions():
 def test_points():
     """Test a striplog of points.
     """
-    points = Striplog.from_csv(filename=io.StringIO(csv_points),
-        points=True, names=True)
+    points = Striplog.from_csv(text=csv_points, points=True)
     assert len(points) == 6
     assert points.order == 'none'