From 98af16741b269d986808b4f6956b0b8b173fd0f2 Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Tue, 26 Nov 2024 13:29:02 -0600 Subject: [PATCH 01/11] update changelog --- CHANGELOG.md | 4 ++++ earthmover/nodes/source.py | 21 +++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c61df96..00a8dec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +### Unreleased changes + +* feature: Allow a `colspec_file` config with column info for `fixedwidth` inputs + ### v0.4.1
Released 2024-11-15 diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index e81d5e0..ca73f60 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -95,7 +95,7 @@ class FileSource(Source): is_remote: bool = False allowed_configs: Tuple[str] = ( 'debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields', - 'file', 'type', 'columns', 'header_rows', 'colspecs', 'rename_cols', + 'file', 'type', 'columns', 'header_rows', 'colspecs', 'colspec_file', 'rename_cols', 'encoding', 'sheet', 'object_type', 'match', 'orientation', 'xpath', ) @@ -252,8 +252,7 @@ def _get_filetype(file: str): ext = file.lower().rsplit('.', 1)[-1] return ext_mapping.get(ext) - @staticmethod - def _get_read_lambda(file_type: str, sep: Optional[str] = None): + def _get_read_lambda(self, file_type: str, sep: Optional[str] = None): """ :param file_type: @@ -266,13 +265,27 @@ def __get_skiprows(config: 'YamlMapping'): _header_rows = config.get('header_rows', 1) return int(_header_rows) - 1 # If header_rows = 1, skip none. + def __read_fwf(file: str, config: 'YamlMapping'): + colspec_file = os.path.join(os.path.dirname(self.config.__file__), config.get('colspec_file')) + if colspec_file: + try: + file_format = pd.read_csv(colspec_file) + except FileNotFoundError: + self.error_handler.throw( + f"colspec file {colspec_file} not found" + ) + colnames = file_format.field_name + colspecs = list(zip(file_format.start_index, file_format.end_index)) + return dd.read_fwf(file, colspecs=colspecs, header=config.get('header_rows', "infer"), names=colnames, converters={c:str for c in colnames}) + else: + return dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=config.get('columns'), converters={c:str for c in config.get('columns')}) # We don't want to activate the function inside this helper function. read_lambda_mapping = { 'csv' : lambda file, config: dd.read_csv(file, sep=sep, dtype=str, encoding=config.get('encoding', "utf8"), keep_default_na=False, skiprows=__get_skiprows(config)), 'excel' : lambda file, config: pd.read_excel(file, sheet_name=config.get("sheet", 0), keep_default_na=False), 'feather' : lambda file, _ : pd.read_feather(file), - 'fixedwidth': lambda file, config: dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=config.get('columns'), converters={c:str for c in config.get('columns')}), + 'fixedwidth': __read_fwf, 'html' : lambda file, config: pd.read_html(file, match=config.get('match', ".+"), keep_default_na=False)[0], 'orc' : lambda file, _ : dd.read_orc(file), 'json' : lambda file, config: dd.read_json(file, typ=config.get('object_type', "frame"), orient=config.get('orientation', "columns")), From 2c4240ea8ce385b71a9757007ead7d172b29397a Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Tue, 26 Nov 2024 13:36:58 -0600 Subject: [PATCH 02/11] add note --- earthmover/nodes/source.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index ca73f60..16f652f 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -270,6 +270,8 @@ def __read_fwf(file: str, config: 'YamlMapping'): if colspec_file: try: file_format = pd.read_csv(colspec_file) + # we need to handle this separately because otherwise EM will report that the source file + # (instead of the colspec file) could not be found except FileNotFoundError: self.error_handler.throw( f"colspec file {colspec_file} not found" From 7076b53880590ecc95bcb264390942826e255342 Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Tue, 3 Dec 2024 16:42:50 -0600 Subject: [PATCH 03/11] make colspec_file read safer --- earthmover/nodes/source.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index 16f652f..81ae7d9 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -266,15 +266,15 @@ def __get_skiprows(config: 'YamlMapping'): return int(_header_rows) - 1 # If header_rows = 1, skip none. def __read_fwf(file: str, config: 'YamlMapping'): - colspec_file = os.path.join(os.path.dirname(self.config.__file__), config.get('colspec_file')) + colspec_file = config.get('colspec_file') if colspec_file: try: - file_format = pd.read_csv(colspec_file) + file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file)) # we need to handle this separately because otherwise EM will report that the source file # (instead of the colspec file) could not be found except FileNotFoundError: self.error_handler.throw( - f"colspec file {colspec_file} not found" + f"colspec file '{colspec_file}' not found" ) colnames = file_format.field_name colspecs = list(zip(file_format.start_index, file_format.end_index)) From f4bec28ed4c8892713a1705bad8092e552e7342c Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Mon, 16 Dec 2024 17:25:06 -0600 Subject: [PATCH 04/11] add documentation, remove support for user-supplied colspecs --- CHANGELOG.md | 2 +- README.md | 2 +- docs/fixedwidth-sources.md | 64 +++++++++++++++++++++++++++++++++ earthmover/nodes/source.py | 73 ++++++++++++++++++++++++++++---------- 4 files changed, 120 insertions(+), 21 deletions(-) create mode 100644 docs/fixedwidth-sources.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 00a8dec..dafb552 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ### Unreleased changes -* feature: Allow a `colspec_file` config with column info for `fixedwidth` inputs +* feature: Require a `colspec_file` config with column info for `fixedwidth` inputs ### v0.4.1
diff --git a/README.md b/README.md index 58adb01..60dcd1c 100644 --- a/README.md +++ b/README.md @@ -254,7 +254,7 @@ Each source must have a name (which is how it is referenced by transformations a - Row-based formats: - `.csv`: Specify the number of `header_rows`, and (if `header_rows` > 0, optionally) overwrite the `column` names. Optionally specify an `encoding` to use when reading the file (the default is UTF8). - `.tsv`: Specify the number of `header_rows`, and (if `header_rows` > 0, optionally) overwrite the `column` names. Optionally specify an `encoding` to use when reading the file (the default is UTF8). - - `.txt`: a fixed-width text file; column widths are inferred from the first 100 lines. + - `.txt`: a fixed-width text file. See [here](TODO:) for usage information - Column-based formats: `.parquet`, `.feather`, `.orc` — these require the [`pyarrow` library](https://arrow.apache.org/docs/python/index.html), which can be installed with `pip install pyarrow` or similar - Structured formats: - `.json`: Optionally specify a `object_type` (`frame` or `series`) and `orientation` (see [these docs](https://pandas.pydata.org/docs/reference/api/pandas.read_json.html)) to interpret different JSON structures. diff --git a/docs/fixedwidth-sources.md b/docs/fixedwidth-sources.md new file mode 100644 index 0000000..b4ca081 --- /dev/null +++ b/docs/fixedwidth-sources.md @@ -0,0 +1,64 @@ +# Working with fixed-width source files + +One challenge of working with fixed-width files (FWFs) is that they require additional metadata. In particular, any tool that reads a FWF into a tabular structure needs to know how to slice each row into its constituent columns. Earthmover expects this information in the form of a **CSV** called a `colspec_file` + +## Specifying a `colspec_file` + +In your earthmover.yaml config, a `fixedwidth` source is specified much like any other file source. Here is a complete example: +```yaml +sources: + input: + file: ./data/input.txt + colspec_file: ./seed/colspecs.csv # always required + colspec_headers: + name: field_name # always required + start: start_index # required if `width` is not provided + end: end_index # required if `width` is not provided + width: field_length # required if `start` or `end` is not provided + type: fixedwidth # required if `file` does not end with '.txt' + header_rows: 0 +``` + +Some notes on the available options + - (required) `colspec_file`: a path to the CSV containing your colspec metadata + - (required) `colspec_headers`: a mapping between the `colspec_file`'s column names and the fields Earthmover requires. **Note that the names and positions of these columns do not matter** + - Of these, only `name` is always required. Your `colspec_file` should contain a column that assigns a name to each field in the FWF + - You must either provide `width`, or both `start` and `end` + - If you provide `width` your `colspec_file` should include a column of integer values that specifies the number of characters in each field in the FWF + - If you provide `start` and `end`, your `colspec_file` should include two columns of integer values [giving the extents of the FWF's fields as half-open intervals (i.e., \[from, to\[ )](https://pandas.pydata.org/docs/reference/api/pandas.read_fwf.html) + - (optional) `type`: if the input file has a `.txt` extension, you do not need to specify `type`. However, since there is no standard extension for FWFs, it is a good idea to use `type: fixedwidth` + - (optional) `header_rows`: this is almost always 0 for FWFs. Earthmover will usually infer this even if you don't specify it, but we recommend doing so + +## Formatting a `colspec_file` +In accordance with the above, a `colspec_file` must include a column with field names, as well as either a column with field widths, or two columns with start and end positions. Both of the following CSVs are valid and equivalent to one another: + +```csv +name,width +date,8 +id,16 +score_1,2 +score_2,2 +``` +For this file, your earthmover.yaml would look like: +```yaml +colspec_headers: + name: name + width: width +``` + +or + +```csv +start_idx, end_idx, other_data, full_field_name, other_data_2 +0, 8, abc, date, def +8, 24, abc, id, def +24, 26, abc, score_1, def +26, 28, abc, score_2, def +``` +For this file, your earthmover.yaml would look like: +```yaml +colspec_headers: + name: full_field_name + start: start_idx + end: end_idx +``` diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index 81ae7d9..32233db 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -95,7 +95,7 @@ class FileSource(Source): is_remote: bool = False allowed_configs: Tuple[str] = ( 'debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields', - 'file', 'type', 'columns', 'header_rows', 'colspecs', 'colspec_file', 'rename_cols', + 'file', 'type', 'columns', 'header_rows', 'colspec_file', 'colspec_headers', 'rename_cols', 'encoding', 'sheet', 'object_type', 'match', 'orientation', 'xpath', ) @@ -252,6 +252,58 @@ def _get_filetype(file: str): ext = file.lower().rsplit('.', 1)[-1] return ext_mapping.get(ext) + def __read_fwf(self, file: str, config: 'YamlMapping'): + colspec_file = config.get('colspec_file') + if not colspec_file: + self.error_handler.throw( + "`colspec_file` must be specified when using a fixedwidth source" + ) + try: + file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file)) + # we need to handle this separately because otherwise EM will report that the source file + # (instead of the colspec file) could not be found + except FileNotFoundError: + self.error_handler.throw( + f"colspec file '{colspec_file}' not found" + ) + + colspec_headers = config.get("colspec_headers") + if not colspec_headers: + self.error_handler.throw( + "`colspec_headers` must be specified when supplying a colspec file" + ) + + try: + # name column is required + name_col = colspec_headers["name"] + except KeyError: + self.error_handler.throw( + "a `name` column must be provided when supplying colspec_headers" + ) + + start_col = colspec_headers.get("start") + end_col = colspec_headers.get("end") + width_col = colspec_headers.get("width") + # pandas does not allow specifying both start/end and widths, but we just let start/end take precedence + if start_col and end_col: + use_widths = False + elif width_col: + use_widths = True + else: + self.error_handler.throw( + "either `width` or (`start`, `end`) must be specified when supplying colspec_headers" + ) + + names = file_format[name_col] + header = config.get('header_rows', "infer") + converters = {c:str for c in names} + if use_widths: + widths = list(file_format[width_col]) + return dd.read_fwf(file, widths=widths, header=header, names=names, converters=converters) + else: + colspecs = list(zip(file_format.start_index, file_format.end_index)) + return dd.read_fwf(file, colspecs=colspecs, header=header, names=names, converters=converters) + def _get_read_lambda(self, file_type: str, sep: Optional[str] = None): """ @@ -265,29 +317,12 @@ def __get_skiprows(config: 'YamlMapping'): _header_rows = config.get('header_rows', 1) return int(_header_rows) - 1 # If header_rows = 1, skip none. - def __read_fwf(file: str, config: 'YamlMapping'): - colspec_file = config.get('colspec_file') - if colspec_file: - try: - file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file)) - # we need to handle this separately because otherwise EM will report that the source file - # (instead of the colspec file) could not be found - except FileNotFoundError: - self.error_handler.throw( - f"colspec file '{colspec_file}' not found" - ) - colnames = file_format.field_name - colspecs = list(zip(file_format.start_index, file_format.end_index)) - return dd.read_fwf(file, colspecs=colspecs, header=config.get('header_rows', "infer"), names=colnames, converters={c:str for c in colnames}) - else: - return dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=config.get('columns'), converters={c:str for c in config.get('columns')}) - # We don't want to activate the function inside this helper function. read_lambda_mapping = { 'csv' : lambda file, config: dd.read_csv(file, sep=sep, dtype=str, encoding=config.get('encoding', "utf8"), keep_default_na=False, skiprows=__get_skiprows(config)), 'excel' : lambda file, config: pd.read_excel(file, sheet_name=config.get("sheet", 0), keep_default_na=False), 'feather' : lambda file, _ : pd.read_feather(file), - 'fixedwidth': __read_fwf, + 'fixedwidth': self.__read_fwf, 'html' : lambda file, config: pd.read_html(file, match=config.get('match', ".+"), keep_default_na=False)[0], 'orc' : lambda file, _ : dd.read_orc(file), 'json' : lambda file, config: dd.read_json(file, typ=config.get('object_type', "frame"), orient=config.get('orientation', "columns")), From 25136b6633348c70666cd05106c74297e9c080a4 Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Mon, 16 Dec 2024 17:28:10 -0600 Subject: [PATCH 05/11] add comment --- earthmover/nodes/source.py | 1 + 1 file changed, 1 insertion(+) diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index 32233db..8fc8160 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -259,6 +259,7 @@ def __read_fwf(self, file: str, config: 'YamlMapping'): "`colspec_file` must be specified when using a fixedwidth source" ) try: + # ensure we find the colspec file relative to the config file that references it (in case of project composition) file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file)) # we need to handle this separately because otherwise EM will report that the source file # (instead of the colspec file) could not be found From 6724ed2c2ead7c8496bcdd91e2931331b15e62e2 Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Mon, 16 Dec 2024 17:30:27 -0600 Subject: [PATCH 06/11] fix link to doc --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 60dcd1c..0556bb3 100644 --- a/README.md +++ b/README.md @@ -254,7 +254,7 @@ Each source must have a name (which is how it is referenced by transformations a - Row-based formats: - `.csv`: Specify the number of `header_rows`, and (if `header_rows` > 0, optionally) overwrite the `column` names. Optionally specify an `encoding` to use when reading the file (the default is UTF8). - `.tsv`: Specify the number of `header_rows`, and (if `header_rows` > 0, optionally) overwrite the `column` names. Optionally specify an `encoding` to use when reading the file (the default is UTF8). - - `.txt`: a fixed-width text file. See [here](TODO:) for usage information + - `.txt`: a fixed-width text file. See [here](https://github.com/edanalytics/earthmover/blob/main/docs/fixedwidth-sources.md) for usage information - Column-based formats: `.parquet`, `.feather`, `.orc` — these require the [`pyarrow` library](https://arrow.apache.org/docs/python/index.html), which can be installed with `pip install pyarrow` or similar - Structured formats: - `.json`: Optionally specify a `object_type` (`frame` or `series`) and `orientation` (see [these docs](https://pandas.pydata.org/docs/reference/api/pandas.read_json.html)) to interpret different JSON structures. From d907ad1d7e20e1214205ba6722ea0bbdde2d5bcf Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Thu, 9 Jan 2025 09:44:14 -0600 Subject: [PATCH 07/11] restore optional columns/colspec functionality --- earthmover/nodes/source.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index 85ec46e..e63a3c0 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -266,9 +266,11 @@ def _get_filetype(file: str): def __read_fwf(self, file: str, config: 'YamlMapping'): colspec_file = config.get('colspec_file') if not colspec_file: - self.error_handler.throw( - "`colspec_file` must be specified when using a fixedwidth source" - ) + names = config.get('columns') + if not names: + self.error_handler.throw("No `colspec_file` specified for fixedwidth source. In this case, `columns` must be specified, and `colspecs` may be specified, or else will be inferred") + + return dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=names, converters={c:str for c in names}) try: # ensure we find the colspec file relative to the config file that references it (in case of project composition) file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file)) From 7429ef98843ebda70e230b9aa0417e54c4cba494 Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Thu, 9 Jan 2025 09:44:54 -0600 Subject: [PATCH 08/11] tweak changelog language --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 08079ae..2f681b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ### Unreleased changes -* feature: Require a `colspec_file` config with column info for `fixedwidth` inputs +* feature: Expect a `colspec_file` config with column info for `fixedwidth` inputs ### v0.4.2
Released 2024-11-15 From d714b889e2aa993db5f11adfc262854b061d536d Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Thu, 9 Jan 2025 11:06:16 -0600 Subject: [PATCH 09/11] fix colspecs --- earthmover/nodes/source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index e63a3c0..d3d881f 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -106,7 +106,7 @@ class FileSource(Source): is_remote: bool = False allowed_configs: Tuple[str] = ( 'debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields', - 'file', 'type', 'columns', 'header_rows', 'colspec_file', 'colspec_headers', 'rename_cols', + 'file', 'type', 'columns', 'header_rows', 'colspec_file', 'colspecs', 'rename_cols', 'encoding', 'sheet', 'object_type', 'match', 'orientation', 'xpath', ) From 32fb7e3944b85760ac8d8b59afb45498659cba90 Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Thu, 9 Jan 2025 11:54:13 -0600 Subject: [PATCH 10/11] fix colspecs --- earthmover/nodes/source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index d3d881f..3451cf7 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -106,7 +106,7 @@ class FileSource(Source): is_remote: bool = False allowed_configs: Tuple[str] = ( 'debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields', - 'file', 'type', 'columns', 'header_rows', 'colspec_file', 'colspecs', 'rename_cols', + 'file', 'type', 'columns', 'header_rows', 'colspec_file', 'colspecs', 'colspec_headers', 'rename_cols', 'encoding', 'sheet', 'object_type', 'match', 'orientation', 'xpath', ) From f2e332ed86b54edfcb4e50e668ffb06d89be8ac8 Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Thu, 16 Jan 2025 17:05:30 -0600 Subject: [PATCH 11/11] change language --- CHANGELOG.md | 2 +- docs/fixedwidth-sources.md | 39 +++++++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f681b5..620a60a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ### Unreleased changes -* feature: Expect a `colspec_file` config with column info for `fixedwidth` inputs +* feature: Allow a `colspec_file` config with column info for `fixedwidth` inputs ### v0.4.2
Released 2024-11-15 diff --git a/docs/fixedwidth-sources.md b/docs/fixedwidth-sources.md index b4ca081..0124b22 100644 --- a/docs/fixedwidth-sources.md +++ b/docs/fixedwidth-sources.md @@ -1,17 +1,18 @@ # Working with fixed-width source files -One challenge of working with fixed-width files (FWFs) is that they require additional metadata. In particular, any tool that reads a FWF into a tabular structure needs to know how to slice each row into its constituent columns. Earthmover expects this information in the form of a **CSV** called a `colspec_file` +One challenge of working with fixed-width files (FWFs) is that they require additional metadata. In particular, any tool that reads a FWF into a tabular structure needs to know how to slice each row into its constituent columns. Earthmover supports two ways of providing this information: -## Specifying a `colspec_file` +## 1. Provide a `colspec_file` In your earthmover.yaml config, a `fixedwidth` source is specified much like any other file source. Here is a complete example: + ```yaml sources: input: file: ./data/input.txt - colspec_file: ./seed/colspecs.csv # always required + colspec_file: ./seed/colspecs.csv # required colspec_headers: - name: field_name # always required + name: field_name # required start: start_index # required if `width` is not provided end: end_index # required if `width` is not provided width: field_length # required if `start` or `end` is not provided @@ -29,7 +30,7 @@ Some notes on the available options - (optional) `type`: if the input file has a `.txt` extension, you do not need to specify `type`. However, since there is no standard extension for FWFs, it is a good idea to use `type: fixedwidth` - (optional) `header_rows`: this is almost always 0 for FWFs. Earthmover will usually infer this even if you don't specify it, but we recommend doing so -## Formatting a `colspec_file` +### Formatting a `colspec_file` In accordance with the above, a `colspec_file` must include a column with field names, as well as either a column with field widths, or two columns with start and end positions. Both of the following CSVs are valid and equivalent to one another: ```csv @@ -40,6 +41,7 @@ score_1,2 score_2,2 ``` For this file, your earthmover.yaml would look like: + ```yaml colspec_headers: name: name @@ -56,9 +58,36 @@ start_idx, end_idx, other_data, full_field_name, other_data_2 26, 28, abc, score_2, def ``` For this file, your earthmover.yaml would look like: + ```yaml colspec_headers: name: full_field_name start: start_idx end: end_idx ``` + +## 2. Provide `colspecs` and `columns` directly + +Alternatively, you can essentially put the same information in your Earthmover config, like this: + +```yaml +sources: + input: + file: ./data/input.txt + type: fixedwidth # required if `file` does not end with '.txt' + header_rows: 0 + colspecs: # required + - [0, 8] + - [8, 24] + - [24, 26] + - [26, 28] + columns: # required + - date + - id + - score_1 + - score_2 +``` + +Some notes on the available options + - (required) `colspecs`: a list of start/end indices [giving the extents of the FWF's fields as half-open intervals (i.e., \[from, to\[ )](https://pandas.pydata.org/docs/reference/api/pandas.read_fwf.html) + - (required) `columns`: a list of column names corresponding to the indices in `colspecs` \ No newline at end of file