-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature: Allow a colspec_file
config with column info for fixedwidth
inputs
#139
Changes from 3 commits
98af167
2c4240e
7076b53
f4bec28
25136b6
413826a
6724ed2
d907ad1
7429ef9
d714b88
32fb7e3
f2e332e
b104a2a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -95,7 +95,7 @@ class FileSource(Source): | |
is_remote: bool = False | ||
allowed_configs: Tuple[str] = ( | ||
'debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields', | ||
'file', 'type', 'columns', 'header_rows', 'colspecs', 'rename_cols', | ||
'file', 'type', 'columns', 'header_rows', 'colspecs', 'colspec_file', 'rename_cols', | ||
'encoding', 'sheet', 'object_type', 'match', 'orientation', 'xpath', | ||
) | ||
|
||
|
@@ -252,8 +252,7 @@ def _get_filetype(file: str): | |
ext = file.lower().rsplit('.', 1)[-1] | ||
return ext_mapping.get(ext) | ||
|
||
@staticmethod | ||
def _get_read_lambda(file_type: str, sep: Optional[str] = None): | ||
def _get_read_lambda(self, file_type: str, sep: Optional[str] = None): | ||
""" | ||
|
||
:param file_type: | ||
|
@@ -266,13 +265,29 @@ def __get_skiprows(config: 'YamlMapping'): | |
_header_rows = config.get('header_rows', 1) | ||
return int(_header_rows) - 1 # If header_rows = 1, skip none. | ||
|
||
def __read_fwf(file: str, config: 'YamlMapping'): | ||
colspec_file = config.get('colspec_file') | ||
if colspec_file: | ||
try: | ||
file_format = pd.read_csv(os.path.join(os.path.dirname(self.config.__file__), colspec_file)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this approach for ascertaining filepath directories consistent with the rest of the project? If so, maybe we should move this logic to a helper. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure I can say whether it's consistent as such; it's needed in order to properly find the colspec file when using project composition. We do use the same construction once elsewhere but that's in a separate class. I've added a comment explaining this but I think it's too rare a usage to justify a separate function as of now |
||
# we need to handle this separately because otherwise EM will report that the source file | ||
# (instead of the colspec file) could not be found | ||
except FileNotFoundError: | ||
self.error_handler.throw( | ||
f"colspec file '{colspec_file}' not found" | ||
) | ||
colnames = file_format.field_name | ||
colspecs = list(zip(file_format.start_index, file_format.end_index)) | ||
return dd.read_fwf(file, colspecs=colspecs, header=config.get('header_rows', "infer"), names=colnames, converters={c:str for c in colnames}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a couple of variables here to clean up these read lines. We should technically be using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you be more specific about what kind of cleanup you want to see? I'm happy to use |
||
else: | ||
return dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=config.get('columns'), converters={c:str for c in config.get('columns')}) | ||
|
||
# We don't want to activate the function inside this helper function. | ||
read_lambda_mapping = { | ||
'csv' : lambda file, config: dd.read_csv(file, sep=sep, dtype=str, encoding=config.get('encoding', "utf8"), keep_default_na=False, skiprows=__get_skiprows(config)), | ||
'excel' : lambda file, config: pd.read_excel(file, sheet_name=config.get("sheet", 0), keep_default_na=False), | ||
'feather' : lambda file, _ : pd.read_feather(file), | ||
'fixedwidth': lambda file, config: dd.read_fwf(file, colspecs=config.get('colspecs', "infer"), header=config.get('header_rows', "infer"), names=config.get('columns'), converters={c:str for c in config.get('columns')}), | ||
'fixedwidth': __read_fwf, | ||
'html' : lambda file, config: pd.read_html(file, match=config.get('match', ".+"), keep_default_na=False)[0], | ||
'orc' : lambda file, _ : dd.read_orc(file), | ||
'json' : lambda file, config: dd.read_json(file, typ=config.get('object_type', "frame"), orient=config.get('orientation', "columns")), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should define any helper methods like these outside the
__get_read_lambda()
helper.I understand the purpose of what we're doing here, but it does not spark joy. We are defining our own filespec for documenting FWF headers. There are a couple of improvements I might suggest: