From ef0f3a302200c743c81507d9e3435c26a276dcb0 Mon Sep 17 00:00:00 2001 From: Bastien Date: Wed, 16 Dec 2020 18:10:20 +0100 Subject: [PATCH 1/7] =?UTF-8?q?Extends=20data=20types=20support=20(date,?= =?UTF-8?q?=20time,=20=E2=80=A6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/bin/extract_metadata.c | 289 ++++++++++++++++++++++++++++++------- 1 file changed, 240 insertions(+), 49 deletions(-) diff --git a/src/bin/extract_metadata.c b/src/bin/extract_metadata.c index 71c2ac06..9d83a2a0 100644 --- a/src/bin/extract_metadata.c +++ b/src/bin/extract_metadata.c @@ -14,37 +14,73 @@ #include "write/json/write_missing_values.h" #include "write/json/write_value_labels.h" -static const char* readstat_type_str(readstat_type_t type) { - if (type == READSTAT_TYPE_STRING) { - return "READSTAT_TYPE_STRING"; - } - - if (type == READSTAT_TYPE_INT8) { - return "READSTAT_TYPE_INT8"; +typedef enum extract_metadata_type_e { + EXTRACT_METADATA_TYPE_NUMERIC, + EXTRACT_METADATA_TYPE_STRING, + EXTRACT_METADATA_TYPE_UNKNOWN +} extract_metadata_type_t; + +static const char* extract_metadata_type_str(extract_metadata_type_t t) { + switch (t) { + case EXTRACT_METADATA_TYPE_NUMERIC: + return "NUMERIC"; + case EXTRACT_METADATA_TYPE_STRING: + return "STRING"; + case EXTRACT_METADATA_TYPE_UNKNOWN: + return "UNKNOWN"; } + return "UNKNOWN"; +} - if (type == READSTAT_TYPE_INT16) { - return "READSTAT_TYPE_INT16"; +typedef enum extract_metadata_format_e { + EXTRACT_METADATA_FORMAT_NUMBER, + EXTRACT_METADATA_FORMAT_PERCENT, + EXTRACT_METADATA_FORMAT_CURRENCY, + EXTRACT_METADATA_FORMAT_DATE, + EXTRACT_METADATA_FORMAT_TIME, + EXTRACT_METADATA_FORMAT_DATE_TIME, + EXTRACT_METADATA_FORMAT_UNSPECIFIED +} extract_metadata_format_t; + +static const char* extract_metadata_format_str(extract_metadata_format_t format) { + switch (format) { + case EXTRACT_METADATA_FORMAT_NUMBER: + return "NUMBER"; + case EXTRACT_METADATA_FORMAT_PERCENT: + return "PERCENT"; + case EXTRACT_METADATA_FORMAT_CURRENCY: + return "CURRENCY"; + case EXTRACT_METADATA_FORMAT_DATE: + return "DATE"; + case EXTRACT_METADATA_FORMAT_TIME: + return "TIME"; + case EXTRACT_METADATA_FORMAT_DATE_TIME: + return "DATE_TIME"; + case EXTRACT_METADATA_FORMAT_UNSPECIFIED: + return "UNSPECIFIED"; } + return "UNSPECIFIED"; +} - if (type == READSTAT_TYPE_INT32) { +static const char* readstat_type_str(readstat_type_t type) { + switch (type) { + case READSTAT_TYPE_STRING: + return "READSTAT_TYPE_STRING"; + case READSTAT_TYPE_INT8: + return "READSTAT_TYPE_INT8"; + case READSTAT_TYPE_INT16: + return "READSTAT_TYPE_INT16"; + case READSTAT_TYPE_INT32: return "READSTAT_TYPE_INT32"; - } - - if (type == READSTAT_TYPE_FLOAT) { + case READSTAT_TYPE_FLOAT: return "READSTAT_TYPE_FLOAT"; - } - - if (type == READSTAT_TYPE_DOUBLE) { + case READSTAT_TYPE_DOUBLE: return "READSTAT_TYPE_DOUBLE"; - } - - if (type == READSTAT_TYPE_STRING_REF) { + case READSTAT_TYPE_STRING_REF: return "READSTAT_TYPE_STRING_REF"; } - return "UNKNOWN TYPE"; -} +} static int extract_decimals(const char *s, char prefix) { if (s && s[0] && s[0]==prefix) { @@ -63,27 +99,149 @@ static int extract_decimals(const char *s, char prefix) { } } +int hasPrefix(const char *str, char *prefix) { + return strncmp(str, prefix, sizeof(prefix)-1); +} + static int handle_variable_sav(int index, readstat_variable_t *variable, const char *val_labels, struct context *ctx) { - char* type = ""; - const char *format = readstat_variable_get_format(variable); - const char *label = readstat_variable_get_label(variable); + extract_metadata_type_t type = EXTRACT_METADATA_TYPE_UNKNOWN; + extract_metadata_format_t format = EXTRACT_METADATA_FORMAT_UNSPECIFIED; + char *pattern = ""; + int decimals = -1; + const char *vformat = readstat_variable_get_format(variable); + const char *label = readstat_variable_get_label(variable); - if (readstat_variable_get_type_class(variable) == READSTAT_TYPE_CLASS_STRING) { - type = "STRING"; - } else if (readstat_variable_get_type_class(variable) == READSTAT_TYPE_CLASS_NUMERIC) { - if (format && (strncmp(format, "DATE", sizeof("DATE")-1) == 0 || - strncmp(format, "ADATE", sizeof("ADATE")-1) == 0 || - strncmp(format, "EDATE", sizeof("EDATE")-1) == 0 || - strncmp(format, "SDATE", sizeof("SDATE")-1) == 0)) { - type = "DATE"; + switch (readstat_variable_get_type_class(variable)) { + case READSTAT_TYPE_CLASS_STRING: + type = EXTRACT_METADATA_TYPE_STRING; + break; + case READSTAT_TYPE_CLASS_NUMERIC: + type = EXTRACT_METADATA_TYPE_NUMERIC; + + // Extract format + // SPSS data types: https://libguides.library.kent.edu/SPSS/DatesTime + // Pattern formats: https://developers.google.com/sheets/api/guides/formats + // TODO: Extract currency + if (vformat) { + if (hasPrefix(vformat, "DATE9") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "d-m-y"; + } else if (hasPrefix(vformat, "DATE11") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "dd-m-yyyy+"; + } else if (hasPrefix(vformat, "ADATE8") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "m/d/y"; + } else if (hasPrefix(vformat, "ADATE10") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "m/d/yyyy+"; + } else if (hasPrefix(vformat, "EDATE8") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "d.m.y"; + } else if (hasPrefix(vformat, "EDATE10") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "d.m.yyyy+"; + } else if (hasPrefix(vformat, "SDATE8") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "y/m/d"; + } else if (hasPrefix(vformat, "SDATE10") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "yyyy+/m/d"; + } else if (hasPrefix(vformat, "DATETIME17") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE_TIME; + pattern = "d-mm-yyyy+ h:mm"; + } else if (hasPrefix(vformat, "DATETIME20") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE_TIME; + pattern = "d-mm-yyyy+ h:mm:ss"; + } else if (hasPrefix(vformat, "DATETIME23.2") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE_TIME; + pattern = "d-mm-yyy+ h:mm:ss"; + } else if (hasPrefix(vformat, "YMDHMS16") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE_TIME; + pattern = "yyyy+-m-d h:mm"; + } else if (hasPrefix(vformat, "YMDHMS19") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE_TIME; + pattern = "yyyy+-m-d h:mm:ss"; + } else if (hasPrefix(vformat, "YMDHMS19.2") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE_TIME; + pattern = "yyyy+-m-d h:mm:ss"; + } else if (hasPrefix(vformat, "MTIME5") == 0) { + format = EXTRACT_METADATA_FORMAT_TIME; + pattern = "[m+]:[s+]"; + } else if (hasPrefix(vformat, "MTIME8.2") == 0) { + format = EXTRACT_METADATA_FORMAT_TIME; + pattern = "[m+]:[s+]"; + } else if (hasPrefix(vformat, "TIME5") == 0) { + format = EXTRACT_METADATA_FORMAT_TIME; + pattern = "[h+]:[m+]"; + } else if (hasPrefix(vformat, "TIME8") == 0) { + format = EXTRACT_METADATA_FORMAT_TIME; + pattern = "[h+]:[m+]:[s+]"; + } else if (hasPrefix(vformat, "TIME11.2") == 0) { + format = EXTRACT_METADATA_FORMAT_TIME; + pattern = "[h+]:[m+]:[s+]"; + } else if (hasPrefix(vformat, "DTIME9") == 0) { + format = EXTRACT_METADATA_FORMAT_TIME; + pattern = "[d+] [h+]:[m+]"; + } else if (hasPrefix(vformat, "DTIME12") == 0) { + format = EXTRACT_METADATA_FORMAT_TIME; + pattern = "[d+] [h+]:[m+]:[s+]"; + } else if (hasPrefix(vformat, "DTIME15.2") == 0) { + format = EXTRACT_METADATA_FORMAT_TIME; + pattern = "[d+] [h+]:[m+]:[s+]"; + } else if (hasPrefix(vformat, "JDATE5") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "yd"; + } else if (hasPrefix(vformat, "JDATE7") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "yyyy+d"; + } else if (hasPrefix(vformat, "QYR6") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "q Q y"; + } else if (hasPrefix(vformat, "QYR8") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "q Q yyyy+"; + } else if (hasPrefix(vformat, "MOYR6") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "mmm yy"; + } else if (hasPrefix(vformat, "MOYR8") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "mmm yyyy"; + } else if (hasPrefix(vformat, "WKYR8") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "w WK y"; + } else if (hasPrefix(vformat, "WKYR10") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "w WK yyyy+"; + } else if (hasPrefix(vformat, "WKDAY3") == 0) { + // Day of the week, three letter abbreviation (e.g., "Mon"). + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "ddd"; + } else if (hasPrefix(vformat, "WKDAY9") == 0) { + // Day of the week, full name. + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "dddd+"; + } else if (hasPrefix(vformat, "MONTH3") == 0) { + // Three letter month abbreviation (e.g., "Feb"). + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "mmm"; + } else if (hasPrefix(vformat, "MONTH9") == 0) { + // Full month name. mmmmmm+ also matches this. + format = EXTRACT_METADATA_FORMAT_DATE; + pattern = "mmmm"; + } else { + format = EXTRACT_METADATA_FORMAT_NUMBER; + decimals = extract_decimals(vformat, 'F'); + } } else { - type = "NUMERIC"; - decimals = extract_decimals(format, 'F'); + format = EXTRACT_METADATA_FORMAT_UNSPECIFIED; } - } else { + break; + default: fprintf(stderr, "%s:%d unhandled type %s\n", __FILE__, __LINE__, readstat_type_str(variable->type)); exit(EXIT_FAILURE); + break; } if (ctx->count == 0) { @@ -93,7 +251,16 @@ static int handle_variable_sav(int index, readstat_variable_t *variable, const c fprintf(ctx->fp, ",\n"); } - fprintf(ctx->fp, "{\"type\": \"%s\", \"name\": \"%s\"", type, variable->name); + fprintf(ctx->fp, "{\"type\": \"%s\", \"name\": \"%s\"", + extract_metadata_type_str(type), + variable->name + ); + if (type == EXTRACT_METADATA_TYPE_NUMERIC) { + fprintf(ctx->fp, ", \"format\": \"%s\"", extract_metadata_format_str(format)); + if (pattern && pattern[0]) { + fprintf(ctx->fp, ", \"pattern\": \"%s\"", pattern); + } + } if (decimals > 0) { fprintf(ctx->fp, ", \"decimals\": %d", decimals); } @@ -105,29 +272,44 @@ static int handle_variable_sav(int index, readstat_variable_t *variable, const c add_val_labels(ctx, variable, val_labels); add_missing_values(ctx, variable); - + fprintf(ctx->fp, "}"); return 0; } static int handle_variable_dta(int index, readstat_variable_t *variable, const char *val_labels, struct context *ctx) { - char *type; - const char *format = readstat_variable_get_format(variable); + extract_metadata_type_t type = EXTRACT_METADATA_TYPE_UNKNOWN; + extract_metadata_format_t format = EXTRACT_METADATA_FORMAT_UNSPECIFIED; + char *pattern = ""; + + const char *vformat = readstat_variable_get_format(variable); const char *label = readstat_variable_get_label(variable); int decimals = -1; - if (readstat_variable_get_type_class(variable) == READSTAT_TYPE_CLASS_STRING) { - type = "STRING"; - } else if (readstat_variable_get_type_class(variable) == READSTAT_TYPE_CLASS_NUMERIC) { - if (format && strcmp(format, "%td") == 0) { - type = "DATE"; + switch (readstat_variable_get_type_class(variable)) { + case READSTAT_TYPE_CLASS_STRING: + type = EXTRACT_METADATA_TYPE_STRING; + break; + case READSTAT_TYPE_CLASS_NUMERIC: + type = EXTRACT_METADATA_TYPE_NUMERIC; + + // Extract format + // Pattern formats: https://developers.google.com/sheets/api/guides/formats + if (vformat) { + if (strcmp(vformat, "%d") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE; + } else if (strcmp(vformat, "%td") == 0) { + format = EXTRACT_METADATA_FORMAT_DATE_TIME; + } } else { - type = "NUMERIC"; - decimals = extract_decimals(format, '%'); + format = EXTRACT_METADATA_FORMAT_NUMBER; + decimals = extract_decimals(vformat, '%'); } - } else { + break; + default: fprintf(stderr, "%s:%d unhandled type %s\n", __FILE__, __LINE__, readstat_type_str(variable->type)); exit(EXIT_FAILURE); + break; } if (ctx->count == 0) { @@ -137,7 +319,16 @@ static int handle_variable_dta(int index, readstat_variable_t *variable, const c fprintf(ctx->fp, ",\n"); } - fprintf(ctx->fp, "{\"type\": \"%s\", \"name\": \"%s\"", type, variable->name); + fprintf(ctx->fp, "{\"type\": \"%s\", \"name\": \"%s\"", + extract_metadata_type_str(type), + variable->name + ); + if (type == EXTRACT_METADATA_TYPE_NUMERIC) { + fprintf(ctx->fp, ", \"format\": \"%s\"", extract_metadata_format_str(format)); + if (pattern && pattern[0]) { + fprintf(ctx->fp, ", \"pattern\": \"%s\"", pattern); + } + } if (decimals > 0) { fprintf(ctx->fp, ", \"decimals\": %d", decimals); } @@ -246,7 +437,7 @@ int pass(struct context *ctx, char *input, char *output, int pass) { } else if (pass == 2) { readstat_set_variable_handler(parser, &handle_variable); } - + const char *filename = input; size_t len = strlen(filename); From 2cb7160c8897f1ae26bd27ed482b35eb42b3ce95 Mon Sep 17 00:00:00 2001 From: Bastien Date: Sun, 20 Dec 2020 17:28:59 +0100 Subject: [PATCH 2/7] Uses UTS35 for date/time patterns --- src/bin/extract_metadata.c | 90 +++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 30 deletions(-) diff --git a/src/bin/extract_metadata.c b/src/bin/extract_metadata.c index 9d83a2a0..36647ce4 100644 --- a/src/bin/extract_metadata.c +++ b/src/bin/extract_metadata.c @@ -121,115 +121,145 @@ static int handle_variable_sav(int index, readstat_variable_t *variable, const c // Extract format // SPSS data types: https://libguides.library.kent.edu/SPSS/DatesTime - // Pattern formats: https://developers.google.com/sheets/api/guides/formats + // Pattern formats: https://unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns // TODO: Extract currency if (vformat) { if (hasPrefix(vformat, "DATE9") == 0) { + // e.g. 31-JAN-13 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "d-m-y"; + pattern = "dd-MMM-yy"; } else if (hasPrefix(vformat, "DATE11") == 0) { + // e.g. 31-JAN-13 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "dd-m-yyyy+"; + pattern = "dd-MMM-yyyy"; } else if (hasPrefix(vformat, "ADATE8") == 0) { + // e.g. 01/31/13 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "m/d/y"; + pattern = "MM/dd/yy"; } else if (hasPrefix(vformat, "ADATE10") == 0) { + // e.g. 01/31/2013 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "m/d/yyyy+"; + pattern = "MM/dd/yyyy"; } else if (hasPrefix(vformat, "EDATE8") == 0) { + // e.g. 31.01.13 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "d.m.y"; + pattern = "dd.MM.yy"; } else if (hasPrefix(vformat, "EDATE10") == 0) { + // e.g. 31.01.2013 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "d.m.yyyy+"; + pattern = "dd.MM.yyyy"; } else if (hasPrefix(vformat, "SDATE8") == 0) { + // e.g. 13/01/31 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "y/m/d"; + pattern = "yy/MM/dd"; } else if (hasPrefix(vformat, "SDATE10") == 0) { + // e.g. 2013/01/31 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "yyyy+/m/d"; + pattern = "yyyy/MM/dd"; } else if (hasPrefix(vformat, "DATETIME17") == 0) { + // e.g. 31-JAN-2013 01:02 format = EXTRACT_METADATA_FORMAT_DATE_TIME; - pattern = "d-mm-yyyy+ h:mm"; + pattern = "dd-MMM-yyyy hh:mm"; } else if (hasPrefix(vformat, "DATETIME20") == 0) { + // e.g. 31-JAN-2013 01:02:33 format = EXTRACT_METADATA_FORMAT_DATE_TIME; - pattern = "d-mm-yyyy+ h:mm:ss"; + pattern = "dd-MMM-yyyy hh:mm:ss"; } else if (hasPrefix(vformat, "DATETIME23.2") == 0) { + // e.g. 31-JAN-2013 01:02:33.72 format = EXTRACT_METADATA_FORMAT_DATE_TIME; - pattern = "d-mm-yyy+ h:mm:ss"; + pattern = "dd-MMM-yyyy hh:mm:ss.SS+"; } else if (hasPrefix(vformat, "YMDHMS16") == 0) { + // e.g. 2013-01-31 1:02 format = EXTRACT_METADATA_FORMAT_DATE_TIME; - pattern = "yyyy+-m-d h:mm"; + pattern = "yyyy-MM-dd h:mm"; } else if (hasPrefix(vformat, "YMDHMS19") == 0) { + // e.g. 2013-01-31 1:02:33 format = EXTRACT_METADATA_FORMAT_DATE_TIME; - pattern = "yyyy+-m-d h:mm:ss"; + pattern = "yyyy-MM-dd h:mm:ss"; } else if (hasPrefix(vformat, "YMDHMS19.2") == 0) { + // e.g. 2013-01-31 1:02:33.72 format = EXTRACT_METADATA_FORMAT_DATE_TIME; - pattern = "yyyy+-m-d h:mm:ss"; + pattern = "yyyy-MM-dd h:mm:ss.SS+"; } else if (hasPrefix(vformat, "MTIME5") == 0) { + // e.g. 1754:36 format = EXTRACT_METADATA_FORMAT_TIME; pattern = "[m+]:[s+]"; } else if (hasPrefix(vformat, "MTIME8.2") == 0) { + // e.g. 1754:36.58 format = EXTRACT_METADATA_FORMAT_TIME; pattern = "[m+]:[s+]"; } else if (hasPrefix(vformat, "TIME5") == 0) { + // e.g. 29:14 format = EXTRACT_METADATA_FORMAT_TIME; pattern = "[h+]:[m+]"; } else if (hasPrefix(vformat, "TIME8") == 0) { + // e.g. 29:14:36 format = EXTRACT_METADATA_FORMAT_TIME; pattern = "[h+]:[m+]:[s+]"; } else if (hasPrefix(vformat, "TIME11.2") == 0) { + // e.g. 29:14:36.58 format = EXTRACT_METADATA_FORMAT_TIME; pattern = "[h+]:[m+]:[s+]"; } else if (hasPrefix(vformat, "DTIME9") == 0) { + // e.g. 1 05:14 format = EXTRACT_METADATA_FORMAT_TIME; pattern = "[d+] [h+]:[m+]"; } else if (hasPrefix(vformat, "DTIME12") == 0) { + // e.g. 1 05:14:36 format = EXTRACT_METADATA_FORMAT_TIME; pattern = "[d+] [h+]:[m+]:[s+]"; } else if (hasPrefix(vformat, "DTIME15.2") == 0) { + // e.g. 1 05:14:36.58 format = EXTRACT_METADATA_FORMAT_TIME; pattern = "[d+] [h+]:[m+]:[s+]"; } else if (hasPrefix(vformat, "JDATE5") == 0) { + // e.g. 13031 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "yd"; + pattern = "yyddd"; } else if (hasPrefix(vformat, "JDATE7") == 0) { + // e.g. 2013031 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "yyyy+d"; + pattern = "yyyyddd"; } else if (hasPrefix(vformat, "QYR6") == 0) { + // e.g. 1 Q 13 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "q Q y"; + pattern = "Q 'Q' y"; } else if (hasPrefix(vformat, "QYR8") == 0) { + // e.g. 1 Q 2013 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "q Q yyyy+"; + pattern = "Q 'Q' yyyy"; } else if (hasPrefix(vformat, "MOYR6") == 0) { + // e.g. JAN 13 format = EXTRACT_METADATA_FORMAT_DATE; pattern = "mmm yy"; } else if (hasPrefix(vformat, "MOYR8") == 0) { + // e.g. JAN 2013 format = EXTRACT_METADATA_FORMAT_DATE; pattern = "mmm yyyy"; } else if (hasPrefix(vformat, "WKYR8") == 0) { + // e.g. 5 WK 13 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "w WK y"; + pattern = "w 'WK' yy"; } else if (hasPrefix(vformat, "WKYR10") == 0) { + // e.g. 5 WK 2013 format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "w WK yyyy+"; + pattern = "w 'WK' yyyy"; } else if (hasPrefix(vformat, "WKDAY3") == 0) { - // Day of the week, three letter abbreviation (e.g., "Mon"). + // Day of the week, three letter abbreviation (e.g. "Mon"). format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "ddd"; + pattern = "eee"; } else if (hasPrefix(vformat, "WKDAY9") == 0) { - // Day of the week, full name. + // Day of the week, full name. (e.g. "Monday") format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "dddd+"; + pattern = "eeee"; } else if (hasPrefix(vformat, "MONTH3") == 0) { - // Three letter month abbreviation (e.g., "Feb"). + // Three letter month abbreviation (e.g. "Feb"). format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "mmm"; + pattern = "MMM"; } else if (hasPrefix(vformat, "MONTH9") == 0) { - // Full month name. mmmmmm+ also matches this. + // Full month name. (e.g. "February") format = EXTRACT_METADATA_FORMAT_DATE; - pattern = "mmmm"; + pattern = "MMMM"; } else { format = EXTRACT_METADATA_FORMAT_NUMBER; decimals = extract_decimals(vformat, 'F'); @@ -294,7 +324,7 @@ static int handle_variable_dta(int index, readstat_variable_t *variable, const c type = EXTRACT_METADATA_TYPE_NUMERIC; // Extract format - // Pattern formats: https://developers.google.com/sheets/api/guides/formats + // Pattern formats: https://unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns if (vformat) { if (strcmp(vformat, "%d") == 0) { format = EXTRACT_METADATA_FORMAT_DATE; From f0e3ca75fef4b20d0e06b8d0ae2882d18007b53e Mon Sep 17 00:00:00 2001 From: Bastien Date: Sun, 20 Dec 2020 18:23:20 +0100 Subject: [PATCH 3/7] Implements column format in read_csv --- src/bin/extract_metadata.c | 16 ----------- src/bin/extract_metadata.h | 16 +++++++++++ src/bin/read_csv/json_metadata.c | 34 +++++++++++++++++----- src/bin/read_csv/json_metadata.h | 10 ++----- src/bin/read_csv/mod_csv.c | 39 +++++++++++++++++++++---- src/bin/read_csv/mod_dta.c | 49 ++++++++++++++++++++++++-------- src/bin/read_csv/mod_sav.c | 45 ++++++++++++++++++++++------- src/bin/read_csv/read_csv.c | 14 ++++----- 8 files changed, 158 insertions(+), 65 deletions(-) diff --git a/src/bin/extract_metadata.c b/src/bin/extract_metadata.c index 36647ce4..1e3bdf62 100644 --- a/src/bin/extract_metadata.c +++ b/src/bin/extract_metadata.c @@ -14,12 +14,6 @@ #include "write/json/write_missing_values.h" #include "write/json/write_value_labels.h" -typedef enum extract_metadata_type_e { - EXTRACT_METADATA_TYPE_NUMERIC, - EXTRACT_METADATA_TYPE_STRING, - EXTRACT_METADATA_TYPE_UNKNOWN -} extract_metadata_type_t; - static const char* extract_metadata_type_str(extract_metadata_type_t t) { switch (t) { case EXTRACT_METADATA_TYPE_NUMERIC: @@ -32,16 +26,6 @@ static const char* extract_metadata_type_str(extract_metadata_type_t t) { return "UNKNOWN"; } -typedef enum extract_metadata_format_e { - EXTRACT_METADATA_FORMAT_NUMBER, - EXTRACT_METADATA_FORMAT_PERCENT, - EXTRACT_METADATA_FORMAT_CURRENCY, - EXTRACT_METADATA_FORMAT_DATE, - EXTRACT_METADATA_FORMAT_TIME, - EXTRACT_METADATA_FORMAT_DATE_TIME, - EXTRACT_METADATA_FORMAT_UNSPECIFIED -} extract_metadata_format_t; - static const char* extract_metadata_format_str(extract_metadata_format_t format) { switch (format) { case EXTRACT_METADATA_FORMAT_NUMBER: diff --git a/src/bin/extract_metadata.h b/src/bin/extract_metadata.h index 240b611e..8287ac46 100644 --- a/src/bin/extract_metadata.h +++ b/src/bin/extract_metadata.h @@ -11,4 +11,20 @@ typedef struct context { readstat_label_set_t *label_set; } context; +typedef enum extract_metadata_type_e { + EXTRACT_METADATA_TYPE_NUMERIC, + EXTRACT_METADATA_TYPE_STRING, + EXTRACT_METADATA_TYPE_UNKNOWN +} extract_metadata_type_t; + +typedef enum extract_metadata_format_e { + EXTRACT_METADATA_FORMAT_NUMBER, + EXTRACT_METADATA_FORMAT_PERCENT, + EXTRACT_METADATA_FORMAT_CURRENCY, + EXTRACT_METADATA_FORMAT_DATE, + EXTRACT_METADATA_FORMAT_TIME, + EXTRACT_METADATA_FORMAT_DATE_TIME, + EXTRACT_METADATA_FORMAT_UNSPECIFIED +} extract_metadata_format_t; + #endif diff --git a/src/bin/read_csv/json_metadata.c b/src/bin/read_csv/json_metadata.c index 4ef3d4c3..17fab49d 100644 --- a/src/bin/read_csv/json_metadata.c +++ b/src/bin/read_csv/json_metadata.c @@ -104,7 +104,7 @@ char* copy_variable_property(struct json_metadata* md, const char* varname, cons if (tok == NULL) { return NULL; } - + int len = tok->end - tok->start; if (len == 0) { return NULL; @@ -186,7 +186,7 @@ int get_decimals(struct json_metadata* md, const char* varname) { } } -metadata_column_type_t column_type(struct json_metadata* md, const char* varname, int output_format) { +extract_metadata_type_t column_type(struct json_metadata* md, const char* varname, int output_format) { jsmntok_t* typ = find_variable_property(md->js, md->tok, varname, "type"); if (!typ) { fprintf(stderr, "Could not find type of variable %s in metadata\n", varname); @@ -194,17 +194,37 @@ metadata_column_type_t column_type(struct json_metadata* md, const char* varname } if (match_token(md->js, typ, "NUMERIC")) { - return METADATA_COLUMN_TYPE_NUMERIC; + return EXTRACT_METADATA_TYPE_NUMERIC; } else if (match_token(md->js, typ, "STRING")) { - return METADATA_COLUMN_TYPE_STRING; - } else if (match_token(md->js, typ, "DATE")) { - return METADATA_COLUMN_TYPE_DATE; + return EXTRACT_METADATA_TYPE_STRING; } else { fprintf(stderr, "%s: %d: Unknown metadata type for variable %s\n", __FILE__, __LINE__, varname); exit(EXIT_FAILURE); } } +extract_metadata_format_t column_format(struct json_metadata* md, const char* varname) { + jsmntok_t* typ = find_variable_property(md->js, md->tok, varname, "format"); + if (!typ) { + return EXTRACT_METADATA_FORMAT_UNSPECIFIED; + } + + if (match_token(md->js, typ, "NUMBER")) { + return EXTRACT_METADATA_FORMAT_NUMBER; + } else if (match_token(md->js, typ, "PERCENT")) { + return EXTRACT_METADATA_FORMAT_PERCENT; + } else if (match_token(md->js, typ, "CURRENCY")) { + return EXTRACT_METADATA_FORMAT_CURRENCY; + } else if (match_token(md->js, typ, "DATE")) { + return EXTRACT_METADATA_FORMAT_DATE; + } else if (match_token(md->js, typ, "TIME")) { + return EXTRACT_METADATA_FORMAT_TIME; + } else if (match_token(md->js, typ, "DATE_TIME")) { + return EXTRACT_METADATA_FORMAT_DATE_TIME; + } + return EXTRACT_METADATA_FORMAT_UNSPECIFIED; +} + double get_double_from_token(const char *js, jsmntok_t* token) { char buf[255]; char *dest; @@ -244,7 +264,7 @@ struct json_metadata* get_json_metadata(const char* filename) { fprintf(stderr, "malloc(): error:%s\n", strerror(errno)); goto errexit; } - + fd = fopen(filename, "rb"); if (fd == NULL) { fprintf(stderr, "Could not open %s: %s\n", filename, strerror(errno)); diff --git a/src/bin/read_csv/json_metadata.h b/src/bin/read_csv/json_metadata.h index 62bab06a..f93b8232 100644 --- a/src/bin/read_csv/json_metadata.h +++ b/src/bin/read_csv/json_metadata.h @@ -1,5 +1,6 @@ #include "jsmn.h" #include "../../readstat.h" +#include "../extract_metadata.h" #ifndef __JSON_METADATA_H_ #define __JSON_METADATA_H_ @@ -9,14 +10,9 @@ typedef struct json_metadata { jsmntok_t* tok; } json_metadata; -typedef enum metadata_column_type_e { - METADATA_COLUMN_TYPE_STRING, - METADATA_COLUMN_TYPE_NUMERIC, - METADATA_COLUMN_TYPE_DATE, -} metadata_column_type_t; - struct json_metadata* get_json_metadata(const char* filename); -metadata_column_type_t column_type(struct json_metadata* md, const char* varname, int output_format); +extract_metadata_type_t column_type(struct json_metadata* md, const char* varname, int output_format); +extract_metadata_format_t column_format(struct json_metadata* md, const char* varname); void free_json_metadata(struct json_metadata*); int get_decimals(struct json_metadata* md, const char* varname); diff --git a/src/bin/read_csv/mod_csv.c b/src/bin/read_csv/mod_csv.c index be5bf3cb..b1fef73b 100644 --- a/src/bin/read_csv/mod_csv.c +++ b/src/bin/read_csv/mod_csv.c @@ -1,6 +1,7 @@ #include #include "../../readstat.h" +#include "../extract_metadata.h" #include "json_metadata.h" #include "read_module.h" #include "csv_metadata.h" @@ -17,13 +18,39 @@ rs_read_module_t rs_read_mod_csv = { static void produce_column_header_csv(void *csv_metadata, const char *column, readstat_variable_t* var) { struct csv_metadata *c = (struct csv_metadata *)csv_metadata; - metadata_column_type_t coltype = column_type(c->json_md, column, c->output_format); - if (coltype == METADATA_COLUMN_TYPE_DATE) { - var->type = READSTAT_TYPE_STRING; - } else if (coltype == METADATA_COLUMN_TYPE_NUMERIC) { - var->type = READSTAT_TYPE_DOUBLE; - } else if (coltype == METADATA_COLUMN_TYPE_STRING) { + extract_metadata_type_t coltype = column_type(c->json_md, column, c->output_format); + switch (coltype) { + case EXTRACT_METADATA_TYPE_NUMERIC:; + extract_metadata_format_t colformat = column_format(c->json_md, column); + switch (colformat) { + case EXTRACT_METADATA_FORMAT_NUMBER: + var->type = READSTAT_TYPE_DOUBLE; + break; + case EXTRACT_METADATA_FORMAT_PERCENT: + var->type = READSTAT_TYPE_STRING; + break; + case EXTRACT_METADATA_FORMAT_CURRENCY: + var->type = READSTAT_TYPE_STRING; + break; + case EXTRACT_METADATA_FORMAT_DATE: + var->type = READSTAT_TYPE_STRING; + break; + case EXTRACT_METADATA_FORMAT_TIME: + var->type = READSTAT_TYPE_STRING; + break; + case EXTRACT_METADATA_FORMAT_DATE_TIME: + var->type = READSTAT_TYPE_STRING; + break; + default: + var->type = READSTAT_TYPE_DOUBLE; + } + break; + case EXTRACT_METADATA_TYPE_STRING: var->type = READSTAT_TYPE_STRING; + break; + case EXTRACT_METADATA_TYPE_UNKNOWN: + // ... + break; } } diff --git a/src/bin/read_csv/mod_dta.c b/src/bin/read_csv/mod_dta.c index 22b23c8d..58c4d7a2 100644 --- a/src/bin/read_csv/mod_dta.c +++ b/src/bin/read_csv/mod_dta.c @@ -147,7 +147,7 @@ static void produce_missingness_discrete_dta(struct csv_metadata *c, jsmntok_t* int j = 1; for (int i=0; isize; i++) { jsmntok_t* missing_value_token = values + j; - if (is_date) { + if (is_date) { dta_add_missing_date(var, get_dta_days_from_token(js, missing_value_token)); } else if (var->type == READSTAT_TYPE_DOUBLE) { dta_add_missing_double(var, get_double_from_token(js, missing_value_token)); @@ -166,7 +166,7 @@ void produce_missingness_dta(void *csv_metadata, const char* column) { const char *js = c->json_md->js; readstat_variable_t* var = &c->variables[c->columns]; var->missingness.missing_ranges_count = 0; - + jsmntok_t* missing = find_variable_property(js, c->json_md->tok, column, "missing"); if (!missing) { return; @@ -190,14 +190,39 @@ void produce_missingness_dta(void *csv_metadata, const char* column) { void produce_column_header_dta(void *csv_metadata, const char *column, readstat_variable_t* var) { struct csv_metadata *c = (struct csv_metadata *)csv_metadata; - metadata_column_type_t coltype = column_type(c->json_md, column, c->output_format); - if (coltype == METADATA_COLUMN_TYPE_DATE) { - snprintf(var->format, sizeof(var->format), "%s", "%td"); - var->type = READSTAT_TYPE_INT32; - } else if (coltype == METADATA_COLUMN_TYPE_NUMERIC) { - var->type = READSTAT_TYPE_DOUBLE; - snprintf(var->format, sizeof(var->format), "%%9.%df", get_decimals(c->json_md, column)); - } else if (coltype == METADATA_COLUMN_TYPE_STRING) { + extract_metadata_type_t coltype = column_type(c->json_md, column, c->output_format); + if (coltype == EXTRACT_METADATA_TYPE_NUMERIC) { + extract_metadata_format_t colformat = column_format(c->json_md, column); + switch (colformat) { + case EXTRACT_METADATA_FORMAT_NUMBER: + var->type = READSTAT_TYPE_DOUBLE; + snprintf(var->format, sizeof(var->format), "%%9.%df", get_decimals(c->json_md, column)); + break; + case EXTRACT_METADATA_FORMAT_PERCENT: + var->type = READSTAT_TYPE_DOUBLE; + snprintf(var->format, sizeof(var->format), "%%9.%df", get_decimals(c->json_md, column)); + break; + case EXTRACT_METADATA_FORMAT_CURRENCY: + var->type = READSTAT_TYPE_DOUBLE; + snprintf(var->format, sizeof(var->format), "%%9.%df", get_decimals(c->json_md, column)); + break; + case EXTRACT_METADATA_FORMAT_DATE: + var->type = READSTAT_TYPE_INT32; + snprintf(var->format, sizeof(var->format), "%s", "%td"); + break; + case EXTRACT_METADATA_FORMAT_TIME: + var->type = READSTAT_TYPE_INT32; + snprintf(var->format, sizeof(var->format), "%s", "%td"); + break; + case EXTRACT_METADATA_FORMAT_DATE_TIME: + var->type = READSTAT_TYPE_INT32; + snprintf(var->format, sizeof(var->format), "%s", "%td"); + break; + default: + var->type = READSTAT_TYPE_DOUBLE; + snprintf(var->format, sizeof(var->format), "%%9.%df", get_decimals(c->json_md, column)); + } + } else if (coltype == EXTRACT_METADATA_TYPE_STRING) { var->type = READSTAT_TYPE_STRING; } } @@ -214,7 +239,7 @@ static void produce_value_label_int32_date_dta(const char* column, struct csv_me .v = { .i32_value = days }, .type = READSTAT_TYPE_INT32, }; - + int missing_ranges_count = readstat_variable_get_missing_ranges_count(variable); for (int i=0; isize; i++) { jsmntok_t* missing_value_token = values + j; - if (is_date) { + if (is_date) { readstat_variable_add_missing_double_value(var, get_double_date_missing_sav(js, missing_value_token)); } else if (var->type == READSTAT_TYPE_DOUBLE) { readstat_variable_add_missing_double_value(var, get_double_from_token(js, missing_value_token)); @@ -101,7 +101,7 @@ void produce_missingness_sav(void *csv_metadata, const char* column) { const char *js = c->json_md->js; readstat_variable_t* var = &c->variables[c->columns]; var->missingness.missing_ranges_count = 0; - + jsmntok_t* missing = find_variable_property(js, c->json_md->tok, column, "missing"); if (!missing) { return; @@ -125,14 +125,39 @@ void produce_missingness_sav(void *csv_metadata, const char* column) { void produce_column_header_sav(void *csv_metadata, const char *column, readstat_variable_t* var) { struct csv_metadata *c = (struct csv_metadata *)csv_metadata; - metadata_column_type_t coltype = column_type(c->json_md, column, c->output_format); - if (coltype == METADATA_COLUMN_TYPE_DATE) { - var->type = READSTAT_TYPE_DOUBLE; - snprintf(var->format, sizeof(var->format), "%s", "EDATE40"); - } else if (coltype == METADATA_COLUMN_TYPE_NUMERIC) { - var->type = READSTAT_TYPE_DOUBLE; - snprintf(var->format, sizeof(var->format), "F8.%d", get_decimals(c->json_md, column)); - } else if (coltype == METADATA_COLUMN_TYPE_STRING) { + extract_metadata_type_t coltype = column_type(c->json_md, column, c->output_format); + if (coltype == EXTRACT_METADATA_TYPE_NUMERIC) { + extract_metadata_format_t colformat = column_format(c->json_md, column); + switch (colformat) { + case EXTRACT_METADATA_FORMAT_NUMBER: + var->type = READSTAT_TYPE_DOUBLE; + snprintf(var->format, sizeof(var->format), "F8.%d", get_decimals(c->json_md, column)); + break; + case EXTRACT_METADATA_FORMAT_PERCENT: + var->type = READSTAT_TYPE_DOUBLE; + snprintf(var->format, sizeof(var->format), "F8.%d", get_decimals(c->json_md, column)); + break; + case EXTRACT_METADATA_FORMAT_CURRENCY: + var->type = READSTAT_TYPE_DOUBLE; + snprintf(var->format, sizeof(var->format), "F8.%d", get_decimals(c->json_md, column)); + break; + case EXTRACT_METADATA_FORMAT_DATE: + var->type = READSTAT_TYPE_DOUBLE; + snprintf(var->format, sizeof(var->format), "%s", "EDATE40"); + break; + case EXTRACT_METADATA_FORMAT_TIME: + var->type = READSTAT_TYPE_DOUBLE; + snprintf(var->format, sizeof(var->format), "%s", "EDATE40"); + break; + case EXTRACT_METADATA_FORMAT_DATE_TIME: + var->type = READSTAT_TYPE_DOUBLE; + snprintf(var->format, sizeof(var->format), "%s", "EDATE40"); + break; + default: + var->type = READSTAT_TYPE_DOUBLE; + snprintf(var->format, sizeof(var->format), "F8.%d", get_decimals(c->json_md, column)); + } + } else if (coltype == EXTRACT_METADATA_TYPE_STRING) { var->type = READSTAT_TYPE_STRING; } } diff --git a/src/bin/read_csv/read_csv.c b/src/bin/read_csv/read_csv.c index 156e6028..afb253e7 100644 --- a/src/bin/read_csv/read_csv.c +++ b/src/bin/read_csv/read_csv.c @@ -32,12 +32,12 @@ static void produce_column_header(struct csv_metadata *c, void *s, size_t len) { char* column = (char*)s; readstat_variable_t* var = &c->variables[c->columns]; memset(var, 0, sizeof(readstat_variable_t)); - metadata_column_type_t coltype = column_type(c->json_md, column, c->output_format); + extract_metadata_type_t coltype = column_type(c->json_md, column, c->output_format); c->is_date[c->columns] = coltype == METADATA_COLUMN_TYPE_DATE; - if (coltype == METADATA_COLUMN_TYPE_STRING) { + if (coltype == EXTRACT_METADATA_TYPE_STRING) { var->alignment = READSTAT_ALIGNMENT_LEFT; - } else if (coltype == METADATA_COLUMN_TYPE_NUMERIC || coltype == METADATA_COLUMN_TYPE_DATE) { + } else if (coltype == EXTRACT_METADATA_TYPE_NUMERIC) { var->alignment = READSTAT_ALIGNMENT_RIGHT; } @@ -45,10 +45,10 @@ static void produce_column_header(struct csv_metadata *c, void *s, size_t len) { c->output_module->header(c, column, var); } - if (c->pass == 2 && coltype == METADATA_COLUMN_TYPE_STRING) { + if (c->pass == 2 && coltype == EXTRACT_METADATA_TYPE_STRING) { var->storage_width = c->column_width[c->columns]; } - + var->index = c->columns; copy_variable_property(c->json_md, column, "label", var->label, sizeof(var->label)); snprintf(var->name, sizeof(var->name), "%.*s", (int)len, column); @@ -99,7 +99,7 @@ static void csv_metadata_row(int cc, void *data) c->open_row = 0; } -readstat_error_t readstat_parse_csv(readstat_parser_t *parser, +readstat_error_t readstat_parse_csv(readstat_parser_t *parser, const char *path, struct csv_metadata* md, void *user_ctx) { readstat_error_t retval = READSTAT_OK; readstat_io_t *io = parser->io; @@ -147,7 +147,7 @@ readstat_error_t readstat_parse_csv(readstat_parser_t *parser, } unsigned char sep = get_separator(md->json_md); csv_set_delim(p, sep); - + while ((bytes_read = io->read(buf, sizeof(buf), io->io_ctx)) > 0) { if (csv_parse(p, buf, bytes_read, csv_metadata_cell, csv_metadata_row, md) != bytes_read) From 9e150ae5ec4ffe5b4de4c0adcba8d3e6080401cc Mon Sep 17 00:00:00 2001 From: Bastien Date: Wed, 23 Dec 2020 20:19:47 +0100 Subject: [PATCH 4/7] csv: Uses double for currency and percent --- src/bin/read_csv/mod_csv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bin/read_csv/mod_csv.c b/src/bin/read_csv/mod_csv.c index b1fef73b..887b878d 100644 --- a/src/bin/read_csv/mod_csv.c +++ b/src/bin/read_csv/mod_csv.c @@ -27,10 +27,10 @@ static void produce_column_header_csv(void *csv_metadata, const char *column, re var->type = READSTAT_TYPE_DOUBLE; break; case EXTRACT_METADATA_FORMAT_PERCENT: - var->type = READSTAT_TYPE_STRING; + var->type = READSTAT_TYPE_DOUBLE; break; case EXTRACT_METADATA_FORMAT_CURRENCY: - var->type = READSTAT_TYPE_STRING; + var->type = READSTAT_TYPE_DOUBLE; break; case EXTRACT_METADATA_FORMAT_DATE: var->type = READSTAT_TYPE_STRING; From 69dfaa2d143886addfc1a8aee99eb6eaeae95e9b Mon Sep 17 00:00:00 2001 From: Bastien Date: Wed, 23 Dec 2020 20:20:33 +0100 Subject: [PATCH 5/7] Changes dta output for date times to %tC --- src/bin/read_csv/mod_dta.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/bin/read_csv/mod_dta.c b/src/bin/read_csv/mod_dta.c index 58c4d7a2..cd5fe7ba 100644 --- a/src/bin/read_csv/mod_dta.c +++ b/src/bin/read_csv/mod_dta.c @@ -216,7 +216,8 @@ void produce_column_header_dta(void *csv_metadata, const char *column, readstat_ break; case EXTRACT_METADATA_FORMAT_DATE_TIME: var->type = READSTAT_TYPE_INT32; - snprintf(var->format, sizeof(var->format), "%s", "%td"); + snprintf(var->format, sizeof(var->format), "%s", "%tC"); + // %tC => is equivalent to coordinated universal time (UTC) break; default: var->type = READSTAT_TYPE_DOUBLE; From 6666b9af35090825627f9f90da808f8d02cfdc97 Mon Sep 17 00:00:00 2001 From: Bastien Date: Wed, 23 Dec 2020 20:20:54 +0100 Subject: [PATCH 6/7] Refactors switch statements --- src/bin/read_csv/mod_dta.c | 9 --------- src/bin/read_csv/mod_sav.c | 12 ------------ 2 files changed, 21 deletions(-) diff --git a/src/bin/read_csv/mod_dta.c b/src/bin/read_csv/mod_dta.c index cd5fe7ba..8ed7736a 100644 --- a/src/bin/read_csv/mod_dta.c +++ b/src/bin/read_csv/mod_dta.c @@ -195,13 +195,7 @@ void produce_column_header_dta(void *csv_metadata, const char *column, readstat_ extract_metadata_format_t colformat = column_format(c->json_md, column); switch (colformat) { case EXTRACT_METADATA_FORMAT_NUMBER: - var->type = READSTAT_TYPE_DOUBLE; - snprintf(var->format, sizeof(var->format), "%%9.%df", get_decimals(c->json_md, column)); - break; case EXTRACT_METADATA_FORMAT_PERCENT: - var->type = READSTAT_TYPE_DOUBLE; - snprintf(var->format, sizeof(var->format), "%%9.%df", get_decimals(c->json_md, column)); - break; case EXTRACT_METADATA_FORMAT_CURRENCY: var->type = READSTAT_TYPE_DOUBLE; snprintf(var->format, sizeof(var->format), "%%9.%df", get_decimals(c->json_md, column)); @@ -211,9 +205,6 @@ void produce_column_header_dta(void *csv_metadata, const char *column, readstat_ snprintf(var->format, sizeof(var->format), "%s", "%td"); break; case EXTRACT_METADATA_FORMAT_TIME: - var->type = READSTAT_TYPE_INT32; - snprintf(var->format, sizeof(var->format), "%s", "%td"); - break; case EXTRACT_METADATA_FORMAT_DATE_TIME: var->type = READSTAT_TYPE_INT32; snprintf(var->format, sizeof(var->format), "%s", "%tC"); diff --git a/src/bin/read_csv/mod_sav.c b/src/bin/read_csv/mod_sav.c index 76cb8ff3..481f4938 100644 --- a/src/bin/read_csv/mod_sav.c +++ b/src/bin/read_csv/mod_sav.c @@ -130,25 +130,13 @@ void produce_column_header_sav(void *csv_metadata, const char *column, readstat_ extract_metadata_format_t colformat = column_format(c->json_md, column); switch (colformat) { case EXTRACT_METADATA_FORMAT_NUMBER: - var->type = READSTAT_TYPE_DOUBLE; - snprintf(var->format, sizeof(var->format), "F8.%d", get_decimals(c->json_md, column)); - break; case EXTRACT_METADATA_FORMAT_PERCENT: - var->type = READSTAT_TYPE_DOUBLE; - snprintf(var->format, sizeof(var->format), "F8.%d", get_decimals(c->json_md, column)); - break; case EXTRACT_METADATA_FORMAT_CURRENCY: var->type = READSTAT_TYPE_DOUBLE; snprintf(var->format, sizeof(var->format), "F8.%d", get_decimals(c->json_md, column)); break; case EXTRACT_METADATA_FORMAT_DATE: - var->type = READSTAT_TYPE_DOUBLE; - snprintf(var->format, sizeof(var->format), "%s", "EDATE40"); - break; case EXTRACT_METADATA_FORMAT_TIME: - var->type = READSTAT_TYPE_DOUBLE; - snprintf(var->format, sizeof(var->format), "%s", "EDATE40"); - break; case EXTRACT_METADATA_FORMAT_DATE_TIME: var->type = READSTAT_TYPE_DOUBLE; snprintf(var->format, sizeof(var->format), "%s", "EDATE40"); From 554afddcd0707e85978a490703cb17a73329e5a0 Mon Sep 17 00:00:00 2001 From: Bastien Date: Mon, 4 Jan 2021 20:44:40 +0100 Subject: [PATCH 7/7] Updates json metadata schema --- variablemetadata_schema.json | 208 +++++++---------------------------- 1 file changed, 41 insertions(+), 167 deletions(-) diff --git a/variablemetadata_schema.json b/variablemetadata_schema.json index 7e9794b8..f4a43ddc 100644 --- a/variablemetadata_schema.json +++ b/variablemetadata_schema.json @@ -19,7 +19,9 @@ }, "separator": { "enum": [ - ",", ";", "\t" + ",", + ";", + "\t" ] }, "variables": { @@ -32,9 +34,6 @@ }, { "$ref": "#/definitions/SPSS-STRING" - }, - { - "$ref": "#/definitions/SPSS-DATE" } ] } @@ -59,6 +58,23 @@ "minLength": 0, "maxLength": 255 }, + "format": { + "type": { + "enum": [ + "NUMBER", + "PERCENT", + "CURRENCY", + "DATE", + "TIME", + "DATE_TIME", + "UNSPECIFIED" + ] + } + }, + "pattern": { + "type": "string", + "minLength": 1 + }, "decimals": { "type": "integer", "minimum": 0, @@ -152,61 +168,6 @@ ], "additionalProperties": false }, - "SPSS-DATE": { - "properties": { - "type": { - "enum": [ - "DATE" - ] - }, - "name": { - "type": "string", - "minLength": 1, - "maxLength": 64 - }, - "label": { - "type": "string", - "minLength": 0, - "maxLength": 255 - }, - "categories": { - "type": "array", - "items": { - "type": "object", - "properties": { - "code": { - "type": "string", - "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" - }, - "label": { - "type": "string" - } - }, - "required": [ - "code", - "label" - ], - "additionalProperties": false - } - }, - "missing": { - "type": "object", - "oneOf": [ - { - "$ref": "#/definitions/SPSS-DATE-DISCRETE" - }, - { - "$ref": "#/definitions/DATE-RANGE" - } - ] - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, "SPSS-NUMERIC-DISCRETE": { "type": "object", "properties": { @@ -255,30 +216,6 @@ ], "additionalProperties": false }, - "SPSS-DATE-DISCRETE": { - "type": "object", - "properties": { - "type": { - "enum": [ - "DISCRETE" - ] - }, - "values": { - "type": "array", - "items": { - "type": "string", - "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" - }, - "minItems": 1, - "maxItems": 3, - "uniqueItems": true - } - }, - "required": [ - "type", - "values" - ] - }, "NUMERIC-RANGE": { "type": "object", "properties": { @@ -342,7 +279,9 @@ }, "separator": { "enum": [ - ",", ";", "\t" + ",", + ";", + "\t" ] }, "variables": { @@ -354,9 +293,6 @@ "$ref": "#/definitions/STATA-NUMERIC" }, { - "$ref": "#/definitions/STATA-DATE" - }, - { "$ref": "#/definitions/STATA-STRING" } ] @@ -382,6 +318,23 @@ "minLength": 0, "maxLength": 255 }, + "format": { + "type": { + "enum": [ + "NUMBER", + "PERCENT", + "CURRENCY", + "DATE", + "TIME", + "DATE_TIME", + "UNSPECIFIED" + ] + } + }, + "pattern": { + "type": "string", + "minLength": 1 + }, "decimals": { "type": "integer", "minimum": 0, @@ -448,85 +401,6 @@ ], "additionalProperties": false }, - "STATA-DATE": { - "properties": { - "type": { - "enum": [ - "DATE" - ] - }, - "name": { - "type": "string", - "minLength": 1, - "maxLength": 32 - }, - "label": { - "type": "string", - "minLength": 0, - "maxLength": 255 - }, - "categories": { - "type": "array", - "items": { - "type": "object", - "properties": { - "code": { - "type": "string", - "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" - }, - "label": { - "type": "string" - } - }, - "required": [ - "code", - "label" - ], - "additionalProperties": false - } - }, - "missing": { - "type": "object", - "oneOf": [ - { - "$ref": "#/definitions/STATA-DATE-DISCRETE" - }, - { - "$ref": "#/definitions/DATE-RANGE" - } - ] - } - }, - "required": [ - "type", - "name" - ], - "additionalProperties": false - }, - "STATA-DATE-DISCRETE": { - "type": "object", - "properties": { - "type": { - "enum": [ - "DISCRETE" - ] - }, - "values": { - "type": "array", - "items": { - "type": "string", - "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" - }, - "minItems": 1, - "maxItems": 26, - "uniqueItems": true - } - }, - "required": [ - "type", - "values" - ] - }, "STATA-STRING": { "properties": { "type": { @@ -544,7 +418,7 @@ "minLength": 0, "maxLength": 255 }, - "categories": { + "categories": { "type": "array", "items": { "type": "object",