edanalytics · tomreitz · Jan 23, 2025 · Nov 26, 2024 · Nov 26, 2024 · Dec 3, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+### Unreleased changes
+
+* feature: Allow a `colspec_file` config with column info for `fixedwidth` inputs
+
 ### v0.4.1
 <details>
 <summary>Released 2024-11-15</summary>

diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py
@@ -95,7 +95,7 @@ class FileSource(Source):
     is_remote: bool = False
     allowed_configs: Tuple[str] = (
         'debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields',
-        'file', 'type', 'columns', 'header_rows', 'colspecs', 'rename_cols',
+        'file', 'type', 'columns', 'header_rows', 'colspecs', 'colspec_file', 'rename_cols',
         'encoding', 'sheet', 'object_type', 'match', 'orientation', 'xpath',
     )
 
@@ -252,8 +252,7 @@ def _get_filetype(file: str):
         ext = file.lower().rsplit('.', 1)[-1]
         return ext_mapping.get(ext)
 
-    @staticmethod
-    def _get_read_lambda(file_type: str, sep: Optional[str] = None):
+    def _get_read_lambda(self, file_type: str, sep: Optional[str] = None):
         """
 
         :param file_type:
@@ -266,13 +265,29 @@ def __get_skiprows(config: 'YamlMapping'):
             _header_rows = config.get('header_rows', 1)
             return int(_header_rows) - 1  # If header_rows = 1, skip none.
 
+        def __read_fwf(file: str, config: 'YamlMapping'):
+            colspec_file = config.get('colspec_file')
+            if colspec_file:
+                try:
+                    file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file))
+                # we need to handle this separately because otherwise EM will report that the source file
+                # (instead of the colspec file) could not be found
+                except FileNotFoundError:
+                    self.error_handler.throw(
+                        f"colspec file '{colspec_file}' not found"
+                    )
+                colnames = file_format.field_name
+                colspecs = list(zip(file_format.start_index, file_format.end_index))
+                return dd.read_fwf(file, colspecs=colspecs, header=config.get('header_rows', "infer"), names=colnames, converters={c:str for c in colnames})
+            else:
+                return dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=config.get('columns'), converters={c:str for c in config.get('columns')})
 
         # We don't want to activate the function inside this helper function.
         read_lambda_mapping = {
             'csv'       : lambda file, config: dd.read_csv(file, sep=sep, dtype=str, encoding=config.get('encoding', "utf8"), keep_default_na=False, skiprows=__get_skiprows(config)),
             'excel'     : lambda file, config: pd.read_excel(file, sheet_name=config.get("sheet", 0), keep_default_na=False),
             'feather'   : lambda file, _     : pd.read_feather(file),
-            'fixedwidth': lambda file, config: dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=config.get('columns'), converters={c:str for c in config.get('columns')}),
+            'fixedwidth': __read_fwf,
             'html'      : lambda file, config: pd.read_html(file, match=config.get('match', ".+"), keep_default_na=False)[0],
             'orc'       : lambda file, _     : dd.read_orc(file),
             'json'      : lambda file, config: dd.read_json(file, typ=config.get('object_type', "frame"), orient=config.get('orientation', "columns")),