From 7f18c9544d1bc707983005708cfa31ac169399ad Mon Sep 17 00:00:00 2001 From: Stepan Henek Date: Thu, 27 Nov 2014 13:04:32 +0100 Subject: [PATCH] added option --raw-input which sets that the input file can be XML file (e.g. fodt) --- odt2txt.1 | 3 +++ odt2txt.c | 39 +++++++++++++++++++++++++++++++++++---- strbuf.c | 40 ++++++++++++++++++++++++++++++++++++++++ strbuf.h | 6 ++++++ 4 files changed, 84 insertions(+), 4 deletions(-) diff --git a/odt2txt.1 b/odt2txt.1 index 88caf12..9b1e5da 100644 --- a/odt2txt.1 +++ b/odt2txt.1 @@ -53,6 +53,9 @@ encoding will be used in automatic mode, use \fB\-\-raw\fR Print raw XML .TP +\fB\-\-raw-input\fR +Input file is a raw XML (fodt, fods, ...) +.TP \fB\-\-version\fR Show version and copyright information .SH COPYRIGHT diff --git a/odt2txt.c b/odt2txt.c index 3480749..9b29fb7 100644 --- a/odt2txt.c +++ b/odt2txt.c @@ -45,6 +45,7 @@ #define VERSION "0.5" static int opt_raw; +static int opt_raw_input = 0; static char *opt_encoding; static int opt_width = 63; static const char *opt_filename; @@ -138,6 +139,7 @@ static void usage(void) "Converts an OpenDocument or OpenOffice.org XML File to raw text.\n\n" "Syntax: odt2txt [options] filename\n\n" "Options: --raw Print raw XML\n" + " --raw-input Input file is a raw XML (fodt, fods, ...)\n" #ifdef NO_ICONV " --encoding=X Ignored. odt2txt has been built without iconv support.\n" " Output will always be encoded in UTF-8\n" @@ -435,11 +437,34 @@ static STRBUF *read_from_zip(const char *zipfile, const char *filename) return content; } -static void format_doc(STRBUF *buf) +static STRBUF *read_from_xml(const char *zipfile, const char *filename) +{ + + FILE *in = fopen(zipfile, "rb"); + if (in == 0) { + fprintf(stderr, "Can't open %s.\n", filename); + exit(EXIT_FAILURE); + } + + STRBUF *content = strbuf_new(); + + strbuf_append_file(content, in); + + fclose(in); + + return content; +} + +static void format_doc(STRBUF *buf, int raw_input) { /* FIXME: Convert buffer to utf-8 first. Are there OpenOffice texts which are not utf8-encoded? */ + if (raw_input) { + RS_O(".*", ""); /* only body */ + RS_G("[^>]*", ""); /* remove binary */ + } + /* headline, first level */ RS_E("]*outline-level=\"1\"[^>]*>([^<]*)<[^>]*>", &h1); RS_E("]*>([^<]*)<[^>]*>", &h2); /* other headlines */ @@ -451,7 +476,8 @@ static void format_doc(STRBUF *buf) /* images */ RS_E("]*draw:name=\"([^\"]*)\"[^>]*>", &image); - RS_G("<[^>]*>", ""); /* replace all remaining tags */ + + RS_G("<[^>]*>", ""); /* replace all remaining tags */ RS_G("\n +", "\n"); /* remove indentations, e.g. kword */ RS_G("\n{3,}", "\n\n"); /* remove large vertical spaces */ @@ -480,6 +506,9 @@ int main(int argc, const char **argv) if (!strcmp(argv[i], "--raw")) { opt_raw = 1; i++; continue; + } else if (!strcmp(argv[i], "--raw-input")) { + opt_raw_input = 1; + i++; continue; } else if (!strncmp(argv[i], "--encoding=", 11)) { size_t arglen = strlen(argv[i]) - 10; #ifdef iconvlist @@ -563,11 +592,13 @@ int main(int argc, const char **argv) } /* read content.xml */ - docbuf = read_from_zip(opt_filename, "content.xml"); + docbuf = opt_raw_input ? + read_from_xml(opt_filename, "content.xml") : + read_from_zip(opt_filename, "content.xml"); if (!opt_raw) { subst_doc(ic, docbuf); - format_doc(docbuf); + format_doc(docbuf, opt_raw_input); } wbuf = wrap(docbuf, opt_width); diff --git a/strbuf.c b/strbuf.c index 22b32c0..64985f9 100644 --- a/strbuf.c +++ b/strbuf.c @@ -156,6 +156,46 @@ int strbuf_subst(STRBUF *buf, return diff; } +size_t strbuf_append_file(STRBUF *buf, FILE *in) +{ + strbuf_check(buf); + + /* save NULLOK flag */ + int nullok = (buf->opt & STRBUF_NULLOK) ? 1 : 0; + strbuf_setopt(buf, STRBUF_NULLOK); + + + size_t len = 0; + size_t read_len = 0; + char readbuf[1024]; + do { + read_len = fread(readbuf, 1, sizeof(readbuf), in); + len += read_len; + + if (read_len > 0) { + while (buf->buf_sz < buf->len + sizeof(readbuf)) + strbuf_grow(buf); + + memcpy(buf->data + buf->len, readbuf, read_len); + buf->len += read_len; + } + + } while (read_len == sizeof(readbuf)); + + /* terminate buffer */ + if (buf->len + 1 > buf->buf_sz) + strbuf_grow(buf); + *(buf->data + buf->len) = '\0'; + + /* restore NULLOK option */ + if (!nullok) + strbuf_unsetopt(buf, STRBUF_NULLOK); + + strbuf_check(buf); + + return len; +} + size_t strbuf_append_inflate(STRBUF *buf, FILE *in) { size_t len; diff --git a/strbuf.h b/strbuf.h index 4e7500b..22f5c58 100644 --- a/strbuf.h +++ b/strbuf.h @@ -55,6 +55,12 @@ size_t strbuf_append(STRBUF *buf, const char *str); */ size_t strbuf_append_inflate(STRBUF *buf, FILE *in); +/* + * Reads a data stream from in and appends it to the buffer out. + * Returns the number of appended characters. + */ +size_t strbuf_append_file(STRBUF *buf, FILE *in); + /* * Returns a pointer to the contained string. */