Skip to content

Commit

Permalink
added option --raw-input which sets that the input file can be XML fi…
Browse files Browse the repository at this point in the history
…le (e.g. fodt)
  • Loading branch information
shenek committed Nov 27, 2014
1 parent 4f8b74e commit 7f18c95
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 4 deletions.
3 changes: 3 additions & 0 deletions odt2txt.1
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ encoding will be used in automatic mode, use
\fB\-\-raw\fR
Print raw XML
.TP
\fB\-\-raw-input\fR
Input file is a raw XML file (fodt, fods, ...)
.TP
\fB\-\-version\fR
Show version and copyright information
.SH COPYRIGHT
Expand Down
39 changes: 35 additions & 4 deletions odt2txt.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#define VERSION "0.5"

/* Set by --raw: the document is output without running subst_doc()
   and format_doc() on it. */
static int opt_raw;
/* Set by --raw-input: the input file is read directly as XML instead
   of extracting content.xml from a zip archive. */
static int opt_raw_input = 0;
/* Output encoding; presumably filled in from --encoding=X (the parsing
   code is only partly visible here -- confirm). */
static char *opt_encoding;
/* Width handed to wrap() when the output text is wrapped. */
static int opt_width = 63;
/* Path of the input document passed to read_from_zip()/read_from_xml(). */
static const char *opt_filename;
Expand Down Expand Up @@ -138,6 +139,7 @@ static void usage(void)
"Converts an OpenDocument or OpenOffice.org XML File to raw text.\n\n"
"Syntax: odt2txt [options] filename\n\n"
"Options: --raw Print raw XML\n"
" --raw-input Input file is a raw XML (fodt, fods, ...)\n"
#ifdef NO_ICONV
" --encoding=X Ignored. odt2txt has been built without iconv support.\n"
" Output will always be encoded in UTF-8\n"
Expand Down Expand Up @@ -435,11 +437,34 @@ static STRBUF *read_from_zip(const char *zipfile, const char *filename)
return content;
}

static void format_doc(STRBUF *buf)
/*
 * Reads a flat (non-zipped) XML document into a newly created STRBUF.
 *
 * zipfile:  path of the file to read. Despite its name it is opened
 *           directly and read as-is; the name mirrors read_from_zip().
 * filename: unused; kept so that the signature matches read_from_zip().
 *
 * Returns the buffer holding the file contents. If the file cannot be
 * opened, an error naming the file is printed to stderr and the
 * program exits with EXIT_FAILURE.
 */
static STRBUF *read_from_xml(const char *zipfile, const char *filename)
{
	FILE *in;
	STRBUF *content;

	(void)filename; /* there is no archive member to select */

	in = fopen(zipfile, "rb");
	if (in == NULL) {
		/* report the path that actually failed to open */
		fprintf(stderr, "Can't open %s.\n", zipfile);
		exit(EXIT_FAILURE);
	}

	content = strbuf_new();
	strbuf_append_file(content, in);

	fclose(in);

	return content;
}

static void format_doc(STRBUF *buf, int raw_input)
{
/* FIXME: Convert buffer to utf-8 first. Are there
OpenOffice texts which are not utf8-encoded? */

if (raw_input) {
RS_O(".*<office:body>", "<office:body>"); /* only body */
RS_G("<office:binary-data>[^>]*</office:binary-data>", ""); /* remove binary */
}

/* headline, first level */
RS_E("<text:h[^>]*outline-level=\"1\"[^>]*>([^<]*)<[^>]*>", &h1);
RS_E("<text:h[^>]*>([^<]*)<[^>]*>", &h2); /* other headlines */
Expand All @@ -451,7 +476,8 @@ static void format_doc(STRBUF *buf)
/* images */
RS_E("<draw:frame[^>]*draw:name=\"([^\"]*)\"[^>]*>", &image);

RS_G("<[^>]*>", ""); /* replace all remaining tags */

RS_G("<[^>]*>", ""); /* replace all remaining tags */
RS_G("\n +", "\n"); /* remove indentations, e.g. kword */
RS_G("\n{3,}", "\n\n"); /* remove large vertical spaces */

Expand Down Expand Up @@ -480,6 +506,9 @@ int main(int argc, const char **argv)
if (!strcmp(argv[i], "--raw")) {
opt_raw = 1;
i++; continue;
} else if (!strcmp(argv[i], "--raw-input")) {
opt_raw_input = 1;
i++; continue;
} else if (!strncmp(argv[i], "--encoding=", 11)) {
size_t arglen = strlen(argv[i]) - 10;
#ifdef iconvlist
Expand Down Expand Up @@ -563,11 +592,13 @@ int main(int argc, const char **argv)
}

/* read content.xml */
docbuf = read_from_zip(opt_filename, "content.xml");
docbuf = opt_raw_input ?
read_from_xml(opt_filename, "content.xml") :
read_from_zip(opt_filename, "content.xml");

if (!opt_raw) {
subst_doc(ic, docbuf);
format_doc(docbuf);
format_doc(docbuf, opt_raw_input);
}

wbuf = wrap(docbuf, opt_width);
Expand Down
40 changes: 40 additions & 0 deletions strbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,46 @@ int strbuf_subst(STRBUF *buf,
return diff;
}

/*
 * Appends the contents of the stream in to buf, reading it in
 * fixed-size chunks until fread() delivers less than a full chunk.
 * The buffer is '\0'-terminated afterwards.
 *
 * Returns the total number of bytes read from the stream.
 */
size_t strbuf_append_file(STRBUF *buf, FILE *in)
{
	char chunk[1024];
	size_t total = 0;
	size_t got;
	int had_nullok;

	strbuf_check(buf);

	/* remember whether STRBUF_NULLOK was already set, then force it on
	   while appending (presumably so that embedded '\0' bytes in the
	   file are tolerated -- confirm against strbuf_check) */
	had_nullok = (buf->opt & STRBUF_NULLOK) != 0;
	strbuf_setopt(buf, STRBUF_NULLOK);

	for (;;) {
		got = fread(chunk, 1, sizeof(chunk), in);
		total += got;

		if (got != 0) {
			/* always make room for a full chunk, even on a short read */
			while (buf->len + sizeof(chunk) > buf->buf_sz)
				strbuf_grow(buf);

			memcpy(&buf->data[buf->len], chunk, got);
			buf->len += got;
		}

		/* a short read ends the loop */
		if (got != sizeof(chunk))
			break;
	}

	/* make sure there is room for the terminator, then write it */
	if (buf->buf_sz < buf->len + 1)
		strbuf_grow(buf);
	buf->data[buf->len] = '\0';

	/* put the STRBUF_NULLOK option back the way we found it */
	if (!had_nullok)
		strbuf_unsetopt(buf, STRBUF_NULLOK);

	strbuf_check(buf);

	return total;
}

size_t strbuf_append_inflate(STRBUF *buf, FILE *in)
{
size_t len;
Expand Down
6 changes: 6 additions & 0 deletions strbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ size_t strbuf_append(STRBUF *buf, const char *str);
*/
size_t strbuf_append_inflate(STRBUF *buf, FILE *in);

/*
 * Reads the data stream in and appends its contents to the buffer
 * buf. The buffer is '\0'-terminated afterwards.
 * Returns the number of appended characters.
 */
size_t strbuf_append_file(STRBUF *buf, FILE *in);

/*
* Returns a pointer to the contained string.
*/
Expand Down

0 comments on commit 7f18c95

Please sign in to comment.