diff --git a/striplog/striplog.py b/striplog/striplog.py
index f77f189..b3c18f4 100644
--- a/striplog/striplog.py
+++ b/striplog/striplog.py
@@ -583,98 +583,169 @@ def _build_list_of_Intervals(cls,
         return list_of_Intervals
 
     @classmethod
-    def from_csv(cls, filename=None,
+    def from_csv(cls, filename,
                  text=None,
+                 order='depth',
+                 usecols=None,  # which columns to read, with 0 being the first
+                 source=None,  # string - where you got the data from
                  dlm=',',
-                 lexicon=None,
+                 delimiter=',',
+                 skip_header=0,
+                 names=None,  # works the same way as np.genfromtxt
+                 fieldnames=None,
                  points=False,
+                 lexicon=None,
                  include=None,
                  exclude=None,
-                 remap=None,
-                 function=None,
-                 null=None,
                  ignore=None,
-                 source=None,
                  stop=None,
-                 fieldnames=None):
-        """
-        Load from a CSV file or text.
-
-        Args
-            filename (str): The filename, or use `text`.
-            text (str): CSV data as a string, or use `filename`.
-            dlm (str): The delimiter, default ','.
-            lexicon (Lexicon): The lexicon to use, optional. Only needed if \
-                parsing descriptions (e.g. cuttings).
-            points (bool): Whether to make a point dataset (as opposed to \
-                ordinary intervals with top and base. Default is False.
-            include: Default is None.
-            exclude: Default is None.
-            remap: Default is None.
-            function: Default is None.
-            null: Default is None.
-            ignore: Default is None.
-            source: Default is None.
-            stop: Default is None.
-            fieldnames: Default is None.
-
-        Returns
-            Striplog. A new instance.
-        """
-        if (filename is None) and (text is None):
-            raise StriplogError("You must provide a filename or CSV text.")
-
-        if (filename is not None):
-            if source is None:
-                source = filename
-            with open(filename, 'r') as f:
-                text = f.read()
-
-        source = source or 'CSV'
-
-        # Deal with multiple spaces in space delimited file.
-        if dlm == ' ':
-            text = re.sub(r'[ \t]+', ' ', text)
-
-        if fieldnames is not None:
-            text = dlm.join(fieldnames) + '\n' + text
+                 function=None,
+                 null=None,
+                 remap=None,
+                 **kwargs
+                 ):
+        """
+        Read a csv file and generate a Striplog.
+        There are a number of cases that should be handled:
+        * Only tops are given - bases are inferred to be the next top.
+        * Only bases are given - tops are inferred to be the previous base.
+        * Both bases and tops are given
+        * Either bases or tops are given along with a thickness -
+            the missing value is calculated using the thickness.
+
+        TODO: handle the missing cases when creating the list of intervals.
+        Currently only the first and third cases will work.
+
+        The easiest is to have the headers in the csv, and then use the
+        `names` argument. If you do this, then the `top`, `base` and
+        `thickness` columns will be inferred if they are named that.
+        Alternatively, the columns to use can be supplied as a tuple of ints
+        using `usecols`.
+
+        The actual CSV reading is done using numpy's genfromtxt, so all its
+        arguments can be passed as kwargs. For details, see
+        https://numpy.org/devdocs/reference/generated/numpy.genfromtxt.html
 
-        try:
-            f = StringIO(text)  # Python 3
-        except TypeError:
-            f = StringIO(unicode(text))  # Python 2
+        Args:
+            filename: Filename, a string, a list of strings, a generator
+                or an open file-like object with a read method, for example,
+                a file or io.StringIO object. If a single string is provided,
+                it is assumed to be the name of a local or remote file. If a
+                list of strings or a generator returning strings is provided,
+                each string is treated as one line in a file. When the URL of
+                a remote file is passed, the file is automatically downloaded
+                to the current directory and opened.
+            text (str, optional): Deprecated.
+            order (str, optional): Controls the direction of the striplogs.
+                Accepted values are 'depth' or 'elevation'. This is most important
+                when using a thickness, defaults to 'depth'.
+            usecols (int, list of ints, optional): Which columns to read. This
+                should be linked with the use of `names` in many cases.
+            source (string, optional): The attribution or source of the file.
+            dlm (string, optional): Delimiter character between data entries in row of
+                file. Deprecated. Please use 'delimiter' instead. Defaults to ','.
+            delimiter (string, optional): The string used to separate values.
+                By default, a single comma (,) acts as delimiter. An integer or
+                sequence of integers can also be provided as width(s) of each field.
+            skip_header (int, optional): Number of rows to skip. Defaults to 0.
+            names (optional): If True, uses first unskipped row to define column
+                headers. Definitely the easiest and most reliable approach.
+                If a list, uses the list for column names. Defaults to None.
+            fieldnames (str, sequence of strings, optional): [description].
+                Deprecated. See `names`. Defaults to None.
+            points (bool, optional): Whether to make a point dataset, as
+                opposed to ordinary intervals. Defaults to False.
+            lexicon (Lexicon, optional): The lexicon to use. Only needed if
+                parsing descriptions (e.g. cuttings). Defaults to None.
+            include ([type], optional): [description]. Defaults to None.
+            exclude ([type], optional): [description]. Defaults to None.
+            ignore ([type], optional): [description]. Defaults to None.
+            stop ([type], optional): [description]. Defaults to None.
+            function (function, optional): Not currently used here, but will be.
+                Defaults to None.
+            null (string, optional): Deprecated. Defaults to None.
+            remap (string, optional): Deprecated. Defaults to None.
+            **kwargs: Passed on to `np.genfromtxt`, except for `top`, `base`,
+                `thickness` (column indices, ints) and `content` (a column
+                index or a sequence of them), which are only used when
+                `names` is not given.
 
-        reader = csv.DictReader(f, delimiter=dlm)
-
-        # Reorganize the data to make fixing it easier.
-        reorg = {k.strip().lower(): []
-                 for k in reader.fieldnames
-                 if k is not None}
-        t = f.tell()
-        for key in reorg:
-            f.seek(t)
-            for r in reader:
-                s = {k.strip().lower(): v.strip() for k, v in r.items()}
+        Returns:
+            Striplog: A Striplog, made of the intervals as defined in the CSV file.
+        """
+
+        # Some deprecation warnings to start with.
+        # This should only be here for a couple of releases.
+        removed_by = 'This arg will be removed in version 0.9.1'
+        if (dlm != ','):
+            delimiter = dlm
+            w = f"'dlm' is deprecated; please use 'delimiter'. {removed_by}"
+            warnings.warn(w, FutureWarning, stacklevel=2)
+
+        if null is not None:
+            w1 = "'null' is deprecated; please use 'fill_values' or 'missing_values'. "
+            w2 = "See numpy.genfromtxt for how these work."
+            warnings.warn(w1+w2, FutureWarning, stacklevel=2)
+
+        if fieldnames is not None:
+            names = fieldnames
+            w = f"'fieldnames' is deprecated; please use 'names'. {removed_by}"
+            warnings.warn(w, FutureWarning, stacklevel=2)
+
+        if remap is not None:
+            w = f"'remap' is deprecated and no longer being used. {removed_by}"
+            warnings.warn(w, FutureWarning, stacklevel=2)
+
+        if text is not None:
+            w = f"'text' is deprecated and no longer being used. {removed_by}"
+            warnings.warn(w, FutureWarning, stacklevel=2)
+
+        # These are not genfromtxt arguments, so take them out of kwargs
+        # before it sees them. They are only used when `names` is not given.
+        top = kwargs.pop('top', None)
+        base = kwargs.pop('base', None)
+        thickness = kwargs.pop('thickness', None)
+        content = kwargs.pop('content', None)
+
+        data = np.genfromtxt(filename, delimiter=delimiter, usecols=usecols,
+                             names=names, skip_header=skip_header,
+                             **kwargs)
+
+        data_dict = {}
+        if names:
+            for name in data.dtype.names:
+                # We expect `top` and `base`, but we might get `tops` and `bases`.
+                if name == 'tops':  # There might be other cases worth checking?
+                    data_dict.update({'top': data[name]})
+                elif name == 'bases':  # There might be other cases worth checking?
+                    data_dict.update({'base': data[name]})
+                else:  # Everything else is handled here, which is quite nice.
+                    data_dict.update({name: data[name]})
+
+        if not names:  # It would be nice to not do something like this.
+            # by forcing users to pass names with something?
+            if top is not None:
+                data_dict.update({'top': data[:, top]})
+            if base is not None:
+                data_dict.update({'base': data[:, base]})
+            if thickness is not None:
+                data_dict.update({'thickness': data[:, thickness]})
+            if content is not None:
                 try:
-                    reorg[key].append(float(s[key]))
-                except ValueError:
-                    reorg[key].append(s[key])
-
-        f.close()
-
-        remap = remap or {}
-        for k, v in remap.items():
-            reorg[v] = reorg.pop(k)
+                    for ii in content:
+                        data_dict.update({f'content_{ii}': data[:, ii]})
+                except TypeError:
+                    data_dict.update({f'content_{content}': data[:, content]})
 
-        data = cls._clean_longitudinal_data(reorg, null=null)
-
-        list_of_Intervals = cls._build_list_of_Intervals(data,
+        # Now we make the intervals.
+        list_of_Intervals = cls._build_list_of_Intervals(data_dict,
                                                          points=points,
                                                          lexicon=lexicon,
                                                          include=include,
                                                          exclude=exclude,
                                                          ignore=ignore,
-                                                         stop=stop)
+                                                         stop=stop
+                                                         )
 
         return cls(list_of_Intervals, source=source)
 
diff --git a/tests/test_plots.py b/tests/test_plots.py
index 2ca3188..5cf933f 100644
--- a/tests/test_plots.py
+++ b/tests/test_plots.py
@@ -9,6 +9,7 @@
 https://pypi.python.org/pypi/pytest-mpl/0.3
 """
 import random
+import io
 
 import matplotlib.pyplot as plt
 import pytest
@@ -111,7 +112,7 @@ def test_striplog_top_plot():
     20, Sobrarbe Fm.
     50, Cretaceous"""
 
-    tops = Striplog.from_csv(text=tops_csv)
+    tops = Striplog.from_csv(io.StringIO(tops_csv), names=True)
 
     fig = tops.plot(style='tops',
                     field='formation',
diff --git a/tests/test_striplog.py b/tests/test_striplog.py
index a4f4947..413a41d 100644
--- a/tests/test_striplog.py
+++ b/tests/test_striplog.py
@@ -3,6 +3,7 @@
 """
 import numpy as np
 import pytest
+import io
 
 from striplog import Component
 from striplog import Interval
@@ -238,7 +239,8 @@ def test_from_descriptions():
 def test_points():
     """Test a striplog of points.
     """
-    points = Striplog.from_csv(text=csv_points, points=True)
+    points = Striplog.from_csv(filename=io.StringIO(csv_points),
+                               points=True, names=True)
     assert len(points) == 6
     assert points.order == 'none'
 