From e4957188734d8d977f3b494983e28156af9a258e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Sun, 2 Jan 2022 01:49:20 -1000 Subject: [PATCH] bgzip text compression mode Co-authored-by: Mike Lin Compressing text now promotes alignment of BGZF blocks with the uncompressed text lines. BGZF blocks start at the beginning of an input line and end after some subsequent newline (except when the block's first line overflows the BGZF block size). This ensures it's possible to specify byte ranges of a BGZF file that decompress into complete text records -- useful for parallel processing and "slicing" from remote servers. To disable this feature and provide a way to produce output identical to that of 1.15 and earlier, the --binary option forces text data to be processed as if it were binary. The idea and initial implementation were Mike Lin's, with the current revised implementation by James Bonfield. --- Makefile | 2 +- bgzip.1 | 11 +++++ bgzip.c | 130 +++++++++++++++++++++++++++++++++++++++++++-------- test/test.pl | 24 ++++++++++ 4 files changed, 147 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index 374141898..827d93db7 100644 --- a/Makefile +++ b/Makefile @@ -494,7 +494,7 @@ htsfile: htsfile.o libhts.a tabix: tabix.o libhts.a $(CC) $(LDFLAGS) -o $@ tabix.o libhts.a $(LIBS) -lpthread -bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) +bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_hfile_h) htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) diff --git a/bgzip.1 b/bgzip.1 index 30c2808e5..07f13fa41 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -74,6 +74,17 @@ after decompression completes the input file will be removed. 
.SH OPTIONS .TP 10 +.B "--binary" +By default, bgzip will attempt to ensure BGZF blocks end on a newline when +the input is a text file. The exception to this is where a single line is +larger than a BGZF block (64 KiB). This can aid tools that use the +index to perform random access on the compressed stream, as the start +of a block is likely to also be the start of a text record. + +This option processes text files as if they were binary content, +ignoring the location of newlines. This also restores the behaviour +of bgzip version 1.15 and earlier for text files. +.TP .BI "-b, --offset " INT Decompress to standard output from virtual file position (0-based uncompressed offset). diff --git a/bgzip.c b/bgzip.c index bd0374811..4516eadec 100644 --- a/bgzip.c +++ b/bgzip.c @@ -36,13 +36,14 @@ #include #include "htslib/bgzf.h" #include "htslib/hts.h" +#include "htslib/hfile.h" #ifdef _WIN32 # define WIN32_LEAN_AND_MEAN # include #endif -static const int WINDOW_SIZE = 64 * 1024; +static const int WINDOW_SIZE = BGZF_BLOCK_SIZE; static void error(const char *format, ...) 
{ @@ -121,15 +122,16 @@ static int bgzip_main_usage(FILE *fp, int status) fprintf(fp, " -r, --reindex (re)index compressed file\n"); fprintf(fp, " -s, --size INT decompress INT bytes (uncompressed size)\n"); fprintf(fp, " -t, --test test integrity of compressed file\n"); + fprintf(fp, " --binary Don't align blocks with text lines\n"); fprintf(fp, " -@, --threads INT number of compression threads to use [1]\n"); return status; } int main(int argc, char **argv) { - int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep; + int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep, binary; BGZF *fp; - void *buffer; + char *buffer; long start, end, size; char *index_fname = NULL; int threads = 1; @@ -151,10 +153,11 @@ int main(int argc, char **argv) {"test", no_argument, NULL, 't'}, {"version", no_argument, NULL, 1}, {"keep", no_argument, NULL, 'k'}, + {"binary", no_argument, NULL, 2}, {NULL, 0, NULL, 0} }; - compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0; + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0; binary = 0; while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:l:grtk",loptions,NULL)) >= 0){ switch(c){ case 'd': compress = 0; break; @@ -175,6 +178,7 @@ int main(int argc, char **argv) "bgzip (htslib) %s\n" "Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; + case 2: binary = 1; break; case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS); case '?': return bgzip_main_usage(stderr, EXIT_FAILURE); } @@ -185,7 +189,7 @@ int main(int argc, char **argv) return 1; } if (compress == 1) { - int f_src = fileno(stdin); + hFILE* f_src = NULL; char out_mode[3] = "w\0"; char out_mode_exclusive[4] = "wx\0"; @@ -198,13 +202,13 @@ int main(int argc, char **argv) out_mode_exclusive[2] = compress_level + '0'; } + if (!(f_src = hopen(argc > optind ? 
argv[optind] : "-", "r"))) { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + if ( argc>optind ) { - if ((f_src = open(argv[optind], O_RDONLY)) < 0) { - fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); - return 1; - } - if (pstdout) fp = bgzf_open("-", out_mode); else @@ -250,18 +254,103 @@ int main(int argc, char **argv) bgzf_mt(fp, threads, 256); buffer = malloc(WINDOW_SIZE); -#ifdef _WIN32 - _setmode(f_src, O_BINARY); -#endif + if (!buffer) + return 1; if (rebgzip){ if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); - while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) + while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); } else { - while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) - if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); + htsFormat fmt; + int textual = 0; + if (!binary + && hts_detect_format(f_src, &fmt) == 0 + && fmt.compression == no_compression) { + switch(fmt.format) { + case text_format: + case sam: + case vcf: + case bed: + case fasta_format: + case fastq_format: + case fai_format: + case fqi_format: + textual = 1; + break; + default: break; // silence clang warnings + } + } + + if (binary || !textual) { + // Binary data, either detected or explicit + while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) + error("Could not write %d bytes: Error %d\n", + c, fp->errcode); + } else { + /* Text mode, try a flush after a newline */ + int in_header = 1, n = 0, long_line = 0; + while ((c = hread(f_src, buffer+n, WINDOW_SIZE-n)) > 0) { + int c2 = c+n; + int flush = 0; + if (in_header && + (long_line || buffer[0] == '@' || buffer[0] == '#')) { + // Scan forward to find the last header line. 
+ int last_start = 0; + n = 0; + while (n < c2) { + if (buffer[n++] != '\n') + continue; + + last_start = n; + if (n < c2 && + !(buffer[n] == '@' || buffer[n] == '#')) { + in_header = 0; + break; + } + } + if (!last_start) { + n = c2; + long_line = 1; + } else { + n = last_start; + flush = 1; + long_line = 0; + } + } else { + // Scan backwards to find the last newline. + n += c; // c read plus previous n overflow + while (--n >= 0 && ((char *)buffer)[n] != '\n') + ; + + if (n >= 0) { + flush = 1; + n++; + } else { + n = c2; + } + } + + // Pos n is either at the end of the buffer with flush==0, + // or the first byte after a newline and a flush point. + if (bgzf_write(fp, buffer, n) < 0) + error("Could not write %d bytes: Error %d\n", + n, fp->errcode); + if (flush) + if (bgzf_flush_try(fp, 65536) < 0) // force + return -1; + + memmove(buffer, buffer+n, c2-n); + n = c2-n; + } + + // Trailing data. + if (bgzf_write(fp, buffer, n) < 0) + error("Could not write %d bytes: Error %d\n", + n, fp->errcode); + } } if ( index ) { @@ -270,13 +359,16 @@ int main(int argc, char **argv) error("Could not write index to '%s'\n", index_fname); } else { if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0) - error("Could not write index to '%s.gz.gzi'", argv[optind]); + error("Could not write index to '%s.gz.gzi'\n", + argv[optind]); } } - if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode); + if (bgzf_close(fp) < 0) + error("Output close failed: Error %d\n", fp->errcode); + if (hclose(f_src) < 0) + error("Input close failed\n"); if (argc > optind && !pstdout && !keep) unlink(argv[optind]); free(buffer); - close(f_src); return 0; } else if ( reindex ) diff --git a/test/test.pl b/test/test.pl index d6c01786a..a529cfd1a 100755 --- a/test/test.pl +++ b/test/test.pl @@ -398,6 +398,30 @@ sub test_bgzip { } passed($opts,$test); + # Round-trip test of text in binary mode + my $test = sprintf('%s %2s threads', 'bgzip text mode round-trip', + $threads ? 
$threads : 'no'); + print "$test: "; + my $c = "$$opts{bin}/bgzip $at --binary -i -I '$index' < '$data' > '$compressed'"; + my ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "$$opts{bin}/bgzip $at -d < '$compressed' > '$uncompressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "cmp '$data' '$uncompressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, $out ? $out : "'$data' '$uncompressed' differ"); + return; + } + passed($opts,$test); + # Extract from an offset $test = sprintf('%s %2s threads', 'bgzip -b', $threads ? $threads : 'no');