From 98af16741b269d986808b4f6956b0b8b173fd0f2 Mon Sep 17 00:00:00 2001
From: johncmerfeld <John.Merfeld@gmail.com>
Date: Tue, 26 Nov 2024 13:29:02 -0600
Subject: [PATCH 01/11] update changelog

---
 CHANGELOG.md               |  4 ++++
 earthmover/nodes/source.py | 21 +++++++++++++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c61df96..00a8dec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+### Unreleased changes
+
+* feature: Allow a `colspec_file` config with column info for `fixedwidth` inputs
+
 ### v0.4.1
 <details>
 <summary>Released 2024-11-15</summary>
diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py
index e81d5e0..ca73f60 100644
--- a/earthmover/nodes/source.py
+++ b/earthmover/nodes/source.py
@@ -95,7 +95,7 @@ class FileSource(Source):
     is_remote: bool = False
     allowed_configs: Tuple[str] = (
         'debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields',
-        'file', 'type', 'columns', 'header_rows', 'colspecs', 'rename_cols',
+        'file', 'type', 'columns', 'header_rows', 'colspecs', 'colspec_file', 'rename_cols',
         'encoding', 'sheet', 'object_type', 'match', 'orientation', 'xpath',
     )
 
@@ -252,8 +252,7 @@ def _get_filetype(file: str):
         ext = file.lower().rsplit('.', 1)[-1]
         return ext_mapping.get(ext)
 
-    @staticmethod
-    def _get_read_lambda(file_type: str, sep: Optional[str] = None):
+    def _get_read_lambda(self, file_type: str, sep: Optional[str] = None):
         """
 
         :param file_type:
@@ -266,13 +265,27 @@ def __get_skiprows(config: 'YamlMapping'):
             _header_rows = config.get('header_rows', 1)
             return int(_header_rows) - 1  # If header_rows = 1, skip none.
 
+        def __read_fwf(file: str, config: 'YamlMapping'):
+            colspec_file = os.path.join(os.path.dirname(self.config.__file__), config.get('colspec_file'))
+            if colspec_file:
+                try:
+                    file_format = pd.read_csv(colspec_file)
+                except FileNotFoundError:
+                    self.error_handler.throw(
+                        f"colspec file {colspec_file} not found"
+                    )
+                colnames = file_format.field_name
+                colspecs = list(zip(file_format.start_index, file_format.end_index))
+                return dd.read_fwf(file, colspecs=colspecs, header=config.get('header_rows', "infer"), names=colnames, converters={c:str for c in colnames})
+            else:
+                return dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=config.get('columns'), converters={c:str for c in config.get('columns')})
 
         # We don't want to activate the function inside this helper function.
         read_lambda_mapping = {
             'csv'       : lambda file, config: dd.read_csv(file, sep=sep, dtype=str, encoding=config.get('encoding', "utf8"), keep_default_na=False, skiprows=__get_skiprows(config)),
             'excel'     : lambda file, config: pd.read_excel(file, sheet_name=config.get("sheet", 0), keep_default_na=False),
             'feather'   : lambda file, _     : pd.read_feather(file),
-            'fixedwidth': lambda file, config: dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=config.get('columns'), converters={c:str for c in config.get('columns')}),
+            'fixedwidth': __read_fwf,
             'html'      : lambda file, config: pd.read_html(file, match=config.get('match', ".+"), keep_default_na=False)[0],
             'orc'       : lambda file, _     : dd.read_orc(file),
             'json'      : lambda file, config: dd.read_json(file, typ=config.get('object_type', "frame"), orient=config.get('orientation', "columns")),

From 2c4240ea8ce385b71a9757007ead7d172b29397a Mon Sep 17 00:00:00 2001
From: johncmerfeld <John.Merfeld@gmail.com>
Date: Tue, 26 Nov 2024 13:36:58 -0600
Subject: [PATCH 02/11] add note

---
 earthmover/nodes/source.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py
index ca73f60..16f652f 100644
--- a/earthmover/nodes/source.py
+++ b/earthmover/nodes/source.py
@@ -270,6 +270,8 @@ def __read_fwf(file: str, config: 'YamlMapping'):
             if colspec_file:
                 try:
                     file_format = pd.read_csv(colspec_file)
+                # we need to handle this separately because otherwise EM will report that the source file
+                # (instead of the colspec file) could not be found
                 except FileNotFoundError:
                     self.error_handler.throw(
                         f"colspec file {colspec_file} not found"

From 7076b53880590ecc95bcb264390942826e255342 Mon Sep 17 00:00:00 2001
From: johncmerfeld <John.Merfeld@gmail.com>
Date: Tue, 3 Dec 2024 16:42:50 -0600
Subject: [PATCH 03/11] make colspec_file read safer

---
 earthmover/nodes/source.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py
index 16f652f..81ae7d9 100644
--- a/earthmover/nodes/source.py
+++ b/earthmover/nodes/source.py
@@ -266,15 +266,15 @@ def __get_skiprows(config: 'YamlMapping'):
             return int(_header_rows) - 1  # If header_rows = 1, skip none.
 
         def __read_fwf(file: str, config: 'YamlMapping'):
-            colspec_file = os.path.join(os.path.dirname(self.config.__file__), config.get('colspec_file'))
+            colspec_file = config.get('colspec_file')
             if colspec_file:
                 try:
-                    file_format = pd.read_csv(colspec_file)
+                    file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file))
                 # we need to handle this separately because otherwise EM will report that the source file
                 # (instead of the colspec file) could not be found
                 except FileNotFoundError:
                     self.error_handler.throw(
-                        f"colspec file {colspec_file} not found"
+                        f"colspec file '{colspec_file}' not found"
                     )
                 colnames = file_format.field_name
                 colspecs = list(zip(file_format.start_index, file_format.end_index))

From f4bec28ed4c8892713a1705bad8092e552e7342c Mon Sep 17 00:00:00 2001
From: johncmerfeld <John.Merfeld@gmail.com>
Date: Mon, 16 Dec 2024 17:25:06 -0600
Subject: [PATCH 04/11] add documentation, remove support for user-supplied
 colspecs

---
 CHANGELOG.md               |  2 +-
 README.md                  |  2 +-
 docs/fixedwidth-sources.md | 64 +++++++++++++++++++++++++++++++++
 earthmover/nodes/source.py | 73 ++++++++++++++++++++++++++++----------
 4 files changed, 120 insertions(+), 21 deletions(-)
 create mode 100644 docs/fixedwidth-sources.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 00a8dec..dafb552 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 ### Unreleased changes
 
-* feature: Allow a `colspec_file` config with column info for `fixedwidth` inputs
+* feature: Require a `colspec_file` config with column info for `fixedwidth` inputs
 
 ### v0.4.1
 <details>
diff --git a/README.md b/README.md
index 58adb01..60dcd1c 100644
--- a/README.md
+++ b/README.md
@@ -254,7 +254,7 @@ Each source must have a name (which is how it is referenced by transformations a
   - Row-based formats:
     - `.csv`: Specify the number of `header_rows`, and (if `header_rows` > 0, optionally) overwrite the `column` names. Optionally specify an `encoding` to use when reading the file (the default is UTF8).
     - `.tsv`: Specify the number of `header_rows`, and (if `header_rows` > 0, optionally) overwrite the `column` names. Optionally specify an `encoding` to use when reading the file (the default is UTF8).
-    - `.txt`: a fixed-width text file; column widths are inferred from the first 100 lines.
+    - `.txt`: a fixed-width text file. See [here](TODO:) for usage information
   - Column-based formats: `.parquet`, `.feather`, `.orc` &mdash; these require the [`pyarrow` library](https://arrow.apache.org/docs/python/index.html), which can be installed with `pip install pyarrow` or similar
   - Structured formats:
     - `.json`: Optionally specify a `object_type` (`frame` or `series`) and `orientation` (see [these docs](https://pandas.pydata.org/docs/reference/api/pandas.read_json.html)) to interpret different JSON structures.
diff --git a/docs/fixedwidth-sources.md b/docs/fixedwidth-sources.md
new file mode 100644
index 0000000..b4ca081
--- /dev/null
+++ b/docs/fixedwidth-sources.md
@@ -0,0 +1,64 @@
+# Working with fixed-width source files
+
+One challenge of working with fixed-width files (FWFs) is that they require additional metadata. In particular, any tool that reads a FWF into a tabular structure needs to know how to slice each row into its constituent columns. Earthmover expects this information in the form of a **CSV** called a `colspec_file`
+
+## Specifying a `colspec_file`
+
+In your earthmover.yaml config, a `fixedwidth` source is specified much like any other file source. Here is a complete example:
+```yaml
+sources:
+  input:
+    file: ./data/input.txt
+    colspec_file: ./seed/colspecs.csv   # always required
+    colspec_headers:
+      name: field_name                  # always required
+      start: start_index                # required if `width` is not provided
+      end: end_index                    # required if `width` is not provided
+      width: field_length               # required if `start` or `end` is not provided
+    type: fixedwidth                    # required if `file` does not end with '.txt'
+    header_rows: 0
+```
+
+Some notes on the available options
+  - (required) `colspec_file`: a path to the CSV containing your colspec metadata
+  - (required) `colspec_headers`: a mapping between the `colspec_file`'s column names and the fields Earthmover requires. **Note that the names and positions of these columns do not matter**
+    - Of these, only `name` is always required. Your `colspec_file` should contain a column that assigns a name to each field in the FWF
+    - You must either provide `width`, or both `start` and `end`
+    - If you provide `width` your `colspec_file` should include a column of integer values that specifies the number of characters in each field in the FWF
+    - If you provide `start` and `end`, your `colspec_file` should include two columns of integer values [giving the extents of the FWF's fields as half-open intervals (i.e., \[from, to\[ )](https://pandas.pydata.org/docs/reference/api/pandas.read_fwf.html) 
+  - (optional) `type`: if the input file has a `.txt` extension, you do not need to specify `type`. However, since there is no standard extension for FWFs, it is a good idea to use `type: fixedwidth`
+  - (optional) `header_rows`: this is almost always 0 for FWFs. Earthmover will usually infer this even if you don't specify it, but we recommend doing so
+
+## Formatting a `colspec_file`
+In accordance with the above, a `colspec_file` must include a column with field names, as well as either a column with field widths, or two columns with start and end positions.  Both of the following CSVs are valid and equivalent to one another:
+
+```csv
+name,width
+date,8
+id,16
+score_1,2
+score_2,2
+```
+For this file, your earthmover.yaml would look like:
+```yaml
+colspec_headers:
+  name: name
+  width: width
+```
+
+or
+
+```csv
+start_idx, end_idx, other_data, full_field_name, other_data_2
+0, 8, abc, date, def
+8, 24, abc, id, def
+24, 26, abc, score_1, def
+26, 28, abc, score_2, def
+```
+For this file, your earthmover.yaml would look like:
+```yaml
+colspec_headers:
+  name: full_field_name
+  start: start_idx
+  end: end_idx
+```
diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py
index 81ae7d9..32233db 100644
--- a/earthmover/nodes/source.py
+++ b/earthmover/nodes/source.py
@@ -95,7 +95,7 @@ class FileSource(Source):
     is_remote: bool = False
     allowed_configs: Tuple[str] = (
         'debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields',
-        'file', 'type', 'columns', 'header_rows', 'colspecs', 'colspec_file', 'rename_cols',
+        'file', 'type', 'columns', 'header_rows', 'colspec_file', 'colspec_headers', 'rename_cols',
         'encoding', 'sheet', 'object_type', 'match', 'orientation', 'xpath',
     )
 
@@ -252,6 +252,58 @@ def _get_filetype(file: str):
         ext = file.lower().rsplit('.', 1)[-1]
         return ext_mapping.get(ext)
 
+    def __read_fwf(self, file: str, config: 'YamlMapping'):
+        colspec_file = config.get('colspec_file')
+        if not colspec_file:
+            self.error_handler.throw(
+                "`colspec_file` must be specified when using a fixedwidth source"
+            )
+        try:
+            file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file))
+        # we need to handle this separately because otherwise EM will report that the source file
+        # (instead of the colspec file) could not be found
+        except FileNotFoundError:
+            self.error_handler.throw(
+                f"colspec file '{colspec_file}' not found"
+            )
+
+        # `colspec_headers` maps the roles earthmover needs (name/start/end/width) onto the
+        # header names actually used in the colspec CSV
+        colspec_headers = config.get("colspec_headers")
+        if not colspec_headers:
+            self.error_handler.throw(
+                "`colspec_headers` must be specified when supplying a colspec file"
+            )
+
+        try:
+            # name column is required
+            name_col = colspec_headers["name"]
+        except KeyError:
+            self.error_handler.throw(
+                "a `name` column must be provided when supplying colspec_headers"
+            )
+
+        start_col = colspec_headers.get("start")
+        end_col = colspec_headers.get("end")
+        width_col = colspec_headers.get("width")
+        if not (start_col and end_col) and not width_col:
+            self.error_handler.throw(
+                "either `width` or (`start`, `end`) must be specified when supplying colspec_headers"
+            )
+
+        # NOTE(review): `header_rows` is passed to pandas as `header` (a row index, not a count);
+        # confirm that `header_rows: 0` does not consume the first data row as a header
+        header = config.get('header_rows', "infer")
+        names = list(file_format[name_col])  # plain list of names rather than a pandas Series
+        converters = {c: str for c in names}  # read every field as text
+        # pandas does not allow specifying both start/end and widths, but we just let start/end take precedence
+        if start_col and end_col:
+            # use the user-mapped headers; the colspec CSV need not call them start_index/end_index
+            colspecs = list(zip(file_format[start_col], file_format[end_col]))
+            return dd.read_fwf(file, colspecs=colspecs, header=header, names=names, converters=converters)
+        widths = list(file_format[width_col])
+        return dd.read_fwf(file, widths=widths, header=header, names=names, converters=converters)
+
     def _get_read_lambda(self, file_type: str, sep: Optional[str] = None):
         """
 
@@ -265,29 +317,12 @@ def __get_skiprows(config: 'YamlMapping'):
             _header_rows = config.get('header_rows', 1)
             return int(_header_rows) - 1  # If header_rows = 1, skip none.
 
-        def __read_fwf(file: str, config: 'YamlMapping'):
-            colspec_file = config.get('colspec_file')
-            if colspec_file:
-                try:
-                    file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file))
-                # we need to handle this separately because otherwise EM will report that the source file
-                # (instead of the colspec file) could not be found
-                except FileNotFoundError:
-                    self.error_handler.throw(
-                        f"colspec file '{colspec_file}' not found"
-                    )
-                colnames = file_format.field_name
-                colspecs = list(zip(file_format.start_index, file_format.end_index))
-                return dd.read_fwf(file, colspecs=colspecs, header=config.get('header_rows', "infer"), names=colnames, converters={c:str for c in colnames})
-            else:
-                return dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=config.get('columns'), converters={c:str for c in config.get('columns')})
-
         # We don't want to activate the function inside this helper function.
         read_lambda_mapping = {
             'csv'       : lambda file, config: dd.read_csv(file, sep=sep, dtype=str, encoding=config.get('encoding', "utf8"), keep_default_na=False, skiprows=__get_skiprows(config)),
             'excel'     : lambda file, config: pd.read_excel(file, sheet_name=config.get("sheet", 0), keep_default_na=False),
             'feather'   : lambda file, _     : pd.read_feather(file),
-            'fixedwidth': __read_fwf,
+            'fixedwidth': self.__read_fwf,
             'html'      : lambda file, config: pd.read_html(file, match=config.get('match', ".+"), keep_default_na=False)[0],
             'orc'       : lambda file, _     : dd.read_orc(file),
             'json'      : lambda file, config: dd.read_json(file, typ=config.get('object_type', "frame"), orient=config.get('orientation', "columns")),

From 25136b6633348c70666cd05106c74297e9c080a4 Mon Sep 17 00:00:00 2001
From: johncmerfeld <John.Merfeld@gmail.com>
Date: Mon, 16 Dec 2024 17:28:10 -0600
Subject: [PATCH 05/11] add comment

---
 earthmover/nodes/source.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py
index 32233db..8fc8160 100644
--- a/earthmover/nodes/source.py
+++ b/earthmover/nodes/source.py
@@ -259,6 +259,7 @@ def __read_fwf(self, file: str, config: 'YamlMapping'):
                 "`colspec_file` must be specified when using a fixedwidth source"
             )
         try:
+            # ensure we find the colspec file relative to the config file that references it (in case of project composition)
             file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file))
         # we need to handle this separately because otherwise EM will report that the source file
         # (instead of the colspec file) could not be found

From 6724ed2c2ead7c8496bcdd91e2931331b15e62e2 Mon Sep 17 00:00:00 2001
From: johncmerfeld <John.Merfeld@gmail.com>
Date: Mon, 16 Dec 2024 17:30:27 -0600
Subject: [PATCH 06/11] fix link to doc

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 60dcd1c..0556bb3 100644
--- a/README.md
+++ b/README.md
@@ -254,7 +254,7 @@ Each source must have a name (which is how it is referenced by transformations a
   - Row-based formats:
     - `.csv`: Specify the number of `header_rows`, and (if `header_rows` > 0, optionally) overwrite the `column` names. Optionally specify an `encoding` to use when reading the file (the default is UTF8).
     - `.tsv`: Specify the number of `header_rows`, and (if `header_rows` > 0, optionally) overwrite the `column` names. Optionally specify an `encoding` to use when reading the file (the default is UTF8).
-    - `.txt`: a fixed-width text file. See [here](TODO:) for usage information
+    - `.txt`: a fixed-width text file. See [Working with fixed-width source files](https://github.com/edanalytics/earthmover/blob/main/docs/fixedwidth-sources.md) for usage information.
   - Column-based formats: `.parquet`, `.feather`, `.orc` &mdash; these require the [`pyarrow` library](https://arrow.apache.org/docs/python/index.html), which can be installed with `pip install pyarrow` or similar
   - Structured formats:
     - `.json`: Optionally specify a `object_type` (`frame` or `series`) and `orientation` (see [these docs](https://pandas.pydata.org/docs/reference/api/pandas.read_json.html)) to interpret different JSON structures.

From d907ad1d7e20e1214205ba6722ea0bbdde2d5bcf Mon Sep 17 00:00:00 2001
From: johncmerfeld <John.Merfeld@gmail.com>
Date: Thu, 9 Jan 2025 09:44:14 -0600
Subject: [PATCH 07/11] restore optional columns/colspec functionality

---
 earthmover/nodes/source.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py
index 85ec46e..e63a3c0 100644
--- a/earthmover/nodes/source.py
+++ b/earthmover/nodes/source.py
@@ -266,9 +266,11 @@ def _get_filetype(file: str):
     def __read_fwf(self, file: str, config: 'YamlMapping'):
         colspec_file = config.get('colspec_file')
         if not colspec_file:
-            self.error_handler.throw(
-                "`colspec_file` must be specified when using a fixedwidth source"
-            )
+            names = config.get('columns')
+            if not names:
+                self.error_handler.throw("No `colspec_file` specified for fixedwidth source. In this case, `columns` must be specified, and `colspecs` may be specified, or else will be inferred")
+
+            return dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=names, converters={c:str for c in names})
         try:
             # ensure we find the colspec file relative to the config file that references it (in case of project composition)
             file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file))

From 7429ef98843ebda70e230b9aa0417e54c4cba494 Mon Sep 17 00:00:00 2001
From: johncmerfeld <John.Merfeld@gmail.com>
Date: Thu, 9 Jan 2025 09:44:54 -0600
Subject: [PATCH 08/11] tweak changelog language

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 08079ae..2f681b5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 ### Unreleased changes
 
-* feature: Require a `colspec_file` config with column info for `fixedwidth` inputs
+* feature: Expect a `colspec_file` config with column info for `fixedwidth` inputs
 ### v0.4.2
 <details>
 <summary>Released 2024-11-15</summary>

From d714b889e2aa993db5f11adfc262854b061d536d Mon Sep 17 00:00:00 2001
From: johncmerfeld <John.Merfeld@gmail.com>
Date: Thu, 9 Jan 2025 11:06:16 -0600
Subject: [PATCH 09/11] fix colspecs

---
 earthmover/nodes/source.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py
index e63a3c0..d3d881f 100644
--- a/earthmover/nodes/source.py
+++ b/earthmover/nodes/source.py
@@ -106,7 +106,7 @@ class FileSource(Source):
     is_remote: bool = False
     allowed_configs: Tuple[str] = (
         'debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields',
-        'file', 'type', 'columns', 'header_rows', 'colspec_file', 'colspec_headers', 'rename_cols',
+        'file', 'type', 'columns', 'header_rows', 'colspec_file', 'colspecs', 'rename_cols',
         'encoding', 'sheet', 'object_type', 'match', 'orientation', 'xpath',
     )
 

From 32fb7e3944b85760ac8d8b59afb45498659cba90 Mon Sep 17 00:00:00 2001
From: johncmerfeld <John.Merfeld@gmail.com>
Date: Thu, 9 Jan 2025 11:54:13 -0600
Subject: [PATCH 10/11] fix colspecs

---
 earthmover/nodes/source.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py
index d3d881f..3451cf7 100644
--- a/earthmover/nodes/source.py
+++ b/earthmover/nodes/source.py
@@ -106,7 +106,7 @@ class FileSource(Source):
     is_remote: bool = False
     allowed_configs: Tuple[str] = (
         'debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields',
-        'file', 'type', 'columns', 'header_rows', 'colspec_file', 'colspecs', 'rename_cols',
+        'file', 'type', 'columns', 'header_rows', 'colspec_file', 'colspecs', 'colspec_headers', 'rename_cols',
         'encoding', 'sheet', 'object_type', 'match', 'orientation', 'xpath',
     )
 

From f2e332ed86b54edfcb4e50e668ffb06d89be8ac8 Mon Sep 17 00:00:00 2001
From: johncmerfeld <John.Merfeld@gmail.com>
Date: Thu, 16 Jan 2025 17:05:30 -0600
Subject: [PATCH 11/11] change language

---
 CHANGELOG.md               |  2 +-
 docs/fixedwidth-sources.md | 39 +++++++++++++++++++++++++++++++++-----
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2f681b5..620a60a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 ### Unreleased changes
 
-* feature: Expect a `colspec_file` config with column info for `fixedwidth` inputs
+* feature: Allow a `colspec_file` config with column info for `fixedwidth` inputs
 ### v0.4.2
 <details>
 <summary>Released 2024-11-15</summary>
diff --git a/docs/fixedwidth-sources.md b/docs/fixedwidth-sources.md
index b4ca081..0124b22 100644
--- a/docs/fixedwidth-sources.md
+++ b/docs/fixedwidth-sources.md
@@ -1,17 +1,18 @@
 # Working with fixed-width source files
 
-One challenge of working with fixed-width files (FWFs) is that they require additional metadata. In particular, any tool that reads a FWF into a tabular structure needs to know how to slice each row into its constituent columns. Earthmover expects this information in the form of a **CSV** called a `colspec_file`
+One challenge of working with fixed-width files (FWFs) is that they require additional metadata. In particular, any tool that reads a FWF into a tabular structure needs to know how to slice each row into its constituent columns. Earthmover supports two ways of providing this information:
 
-## Specifying a `colspec_file`
+## 1. Provide a `colspec_file`
 
 In your earthmover.yaml config, a `fixedwidth` source is specified much like any other file source. Here is a complete example:
+
 ```yaml
 sources:
   input:
     file: ./data/input.txt
-    colspec_file: ./seed/colspecs.csv   # always required
+    colspec_file: ./seed/colspecs.csv   # required
     colspec_headers:
-      name: field_name                  # always required
+      name: field_name                  # required
       start: start_index                # required if `width` is not provided
       end: end_index                    # required if `width` is not provided
       width: field_length               # required if `start` or `end` is not provided
@@ -29,7 +30,7 @@ Some notes on the available options
   - (optional) `type`: if the input file has a `.txt` extension, you do not need to specify `type`. However, since there is no standard extension for FWFs, it is a good idea to use `type: fixedwidth`
   - (optional) `header_rows`: this is almost always 0 for FWFs. Earthmover will usually infer this even if you don't specify it, but we recommend doing so
 
-## Formatting a `colspec_file`
+### Formatting a `colspec_file`
 In accordance with the above, a `colspec_file` must include a column with field names, as well as either a column with field widths, or two columns with start and end positions.  Both of the following CSVs are valid and equivalent to one another:
 
 ```csv
@@ -40,6 +41,7 @@ score_1,2
 score_2,2
 ```
 For this file, your earthmover.yaml would look like:
+
 ```yaml
 colspec_headers:
   name: name
@@ -56,9 +58,36 @@ start_idx, end_idx, other_data, full_field_name, other_data_2
 26, 28, abc, score_2, def
 ```
 For this file, your earthmover.yaml would look like:
+
 ```yaml
 colspec_headers:
   name: full_field_name
   start: start_idx
   end: end_idx
 ```
+
+## 2. Provide `colspecs` and `columns` directly
+
+Alternatively, you can put the same information directly in your Earthmover config, like this:
+
+```yaml
+sources:
+  input:
+    file: ./data/input.txt
+    type: fixedwidth            # required if `file` does not end with '.txt'
+    header_rows: 0
+    colspecs:                   # optional but recommended; inferred from the data if omitted
+      - [0, 8]
+      - [8, 24]
+      - [24, 26]
+      - [26, 28]
+    columns:                    # required
+      - date
+      - id
+      - score_1
+      - score_2
+```
+
+Some notes on the available options
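+  - (optional, but strongly recommended) `colspecs`: a list of start/end indices [giving the extents of the FWF's fields as half-open intervals (i.e., \[from, to\[ )](https://pandas.pydata.org/docs/reference/api/pandas.read_fwf.html). If omitted, Earthmover asks the reader to infer the column boundaries from the data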
+  - (required) `columns`: a list of column names corresponding to the indices in `colspecs`
\ No newline at end of file