Skip to content

Commit

Permalink
create --sparse, file map support for the "fixed" chunker, see #14
Browse files Browse the repository at this point in the history
a file map can be:

- created internally inside chunkify by calling sparsemap, which uses
  SEEK_DATA / SEEK_HOLE to determine data and hole ranges inside a
  seekable sparse file.
  Usage: borg create --sparse --chunker-params=fixed,BLOCKSIZE ...
  BLOCKSIZE is the chunker blocksize here, not the filesystem blocksize!

- made by some other means and given to the chunkify function.
  this is not used yet, but in future this could be used to only read
  the changed parts and seek over the (known) unchanged parts of a file.

sparsemap: the generated range sizes are multiples of the fs block size.
           the tests assume 4kiB fs block size.
  • Loading branch information
ThomasWaldmann committed Dec 12, 2020
1 parent c2118f1 commit a21ea5d
Show file tree
Hide file tree
Showing 5 changed files with 260 additions and 42 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,5 +297,5 @@ def members_appended(*ds):
setup_requires=['setuptools_scm>=1.7'],
install_requires=install_requires,
extras_require=extras_require,
python_requires='>=3.6',
python_requires='>=3.5',
)
4 changes: 2 additions & 2 deletions src/borg/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -1167,7 +1167,7 @@ class FilesystemObjectProcessors:

def __init__(self, *, metadata_collector, cache, key,
add_item, process_file_chunks,
chunker_params, show_progress):
chunker_params, show_progress, sparse):
self.metadata_collector = metadata_collector
self.cache = cache
self.key = key
Expand All @@ -1178,7 +1178,7 @@ def __init__(self, *, metadata_collector, cache, key,
self.hard_links = {}
self.stats = Statistics() # threading: done by cache (including progress)
self.cwd = os.getcwd()
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse)

@contextmanager
def create_helper(self, path, st, status=None, hardlinkable=True):
Expand Down
4 changes: 3 additions & 1 deletion src/borg/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,7 @@ def create_inner(archive, cache, fso):
checkpoint_interval=args.checkpoint_interval, rechunkify=False)
fso = FilesystemObjectProcessors(metadata_collector=metadata_collector, cache=cache, key=key,
process_file_chunks=cp.process_file_chunks, add_item=archive.add_item,
chunker_params=args.chunker_params, show_progress=args.progress)
chunker_params=args.chunker_params, show_progress=args.progress, sparse=args.sparse)
create_inner(archive, cache, fso)
else:
create_inner(None, None, None)
Expand Down Expand Up @@ -3341,6 +3341,8 @@ def define_borg_mount(parser):
help='deprecated, use ``--noflags`` instead')
fs_group.add_argument('--noflags', dest='noflags', action='store_true',
help='do not read and store flags (e.g. NODUMP, IMMUTABLE) into archive')
fs_group.add_argument('--sparse', dest='sparse', action='store_true',
help='detect sparse holes in input (supported only by fixed chunker)')
fs_group.add_argument('--files-cache', metavar='MODE', dest='files_cache_mode',
type=FilesCacheMode, default=DEFAULT_FILES_CACHE_MODE_UI,
help='operate files cache in MODE. default: %s' % DEFAULT_FILES_CACHE_MODE_UI)
Expand Down
172 changes: 134 additions & 38 deletions src/borg/chunker.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

API_VERSION = '1.2_01'

import errno
import os

from libc.stdlib cimport free
Expand All @@ -19,11 +20,85 @@ cdef extern from "_chunker.c":
uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)


def dread(offset, size, fd=None, fh=-1):
    """
    Read up to *size* bytes from a file, dispatching on the handle type.

    :param offset: current position in the file (only used for fadvise bookkeeping)
    :param size: number of bytes to read
    :param fd: Python file object (used when no OS-level handle is given)
    :param fh: OS-level file handle; -1 means "not available, use fd instead"
    :return: the bytes read (may be shorter than *size* at EOF)
    """
    if fh < 0:
        # no OS-level handle available, fall back to the Python file object
        return fd.read(size)
    data = os.read(fh, size)
    if hasattr(os, 'posix_fadvise'):
        # UNIX only and, in case of block sizes that are not a multiple of the
        # system's page size, better be used with a bug fixed linux kernel > 4.6.0,
        # see comment/workaround in _chunker.c and borgbackup issue #907.
        os.posix_fadvise(fh, offset, len(data), os.POSIX_FADV_DONTNEED)
    return data


def dseek(amount, whence, fd=None, fh=-1):
    """
    Seek in a file, dispatching on the handle type.

    :param amount: seek offset
    :param whence: os.SEEK_SET / os.SEEK_CUR / os.SEEK_END / os.SEEK_DATA / os.SEEK_HOLE
    :param fd: Python file object (used when no OS-level handle is given)
    :param fh: OS-level file handle; -1 means "not available, use fd instead"
    :return: the new absolute file position
    """
    if fh >= 0:
        return os.lseek(fh, amount, whence)
    return fd.seek(amount, whence)


def dpos_curr_end(fd=None, fh=-1):
    """
    Return a (current_position, end_position) tuple for the given file;
    end_position equals the file length. The file position is restored
    before returning, so callers observe no side effect.

    :param fd: Python file object (used when no OS-level handle is given)
    :param fh: OS-level file handle; -1 means "not available, use fd instead"
    """
    if fh >= 0:
        curr = os.lseek(fh, 0, os.SEEK_CUR)
        end = os.lseek(fh, 0, os.SEEK_END)
        os.lseek(fh, curr, os.SEEK_SET)
    else:
        curr = fd.seek(0, os.SEEK_CUR)
        end = fd.seek(0, os.SEEK_END)
        fd.seek(curr, os.SEEK_SET)
    return curr, end


def sparsemap(fd=None, fh=-1):
    """
    Generator yielding (start, length, is_data) tuples that describe the
    data (is_data == True) and hole (is_data == False) ranges inside the file.
    note:
    the map is generated starting from the current seek position (it
    is not required to be 0 / to be at the start of the file) and
    works from there up to the end of the file.
    when the generator is finished, the file pointer position will be
    reset to where it was before calling this function.
    """
    curr, file_len = dpos_curr_end(fd, fh)  # start is the CURRENT position now.
    start = curr
    try:
        # alternate between SEEK_HOLE (finds the end of a data range) and
        # SEEK_DATA (finds the end of a hole) to walk the file range by range.
        whence = os.SEEK_HOLE
        while True:
            is_data = whence == os.SEEK_HOLE  # True: range with data, False: range is a hole
            try:
                end = dseek(start, whence, fd, fh)
            except OSError as e:
                # ENXIO: seek position is at or beyond EOF, i.e. no further range.
                if e.errno == errno.ENXIO:
                    if not is_data and start < file_len:
                        # if there is only sparse space at the end of a file, we can not
                        # find the file end by SEEK_DATA (because run into ENXIO), thus
                        # we must manually deal with this case:
                        end = file_len
                        yield (start, end - start, is_data)
                    break
                else:
                    raise
            # we do not want to yield zero-length ranges with start == end:
            if end > start:
                yield (start, end - start, is_data)
            start = end
            whence = os.SEEK_DATA if is_data else os.SEEK_HOLE
    finally:
        # seek to same position as before calling this function
        dseek(curr, os.SEEK_SET, fd, fh)


class ChunkerFixed:
"""
Fixed blocksize Chunker, optionally supporting a header block of different size.
Fixed blocksize Chunker, optionally supporting:
This is a very simple chunker for input data with known block/record sizes:
- a header block of different size
- using a sparsemap to only read ranges with data
This is a simple chunker for input data with known block/record sizes:
- raw disk images
- block devices
Expand All @@ -32,52 +107,72 @@ class ChunkerFixed:
Note: the last block of the input data may be less than the block size,
this is supported and not considered to be an error.
"""
def __init__(self, block_size, header_size=0):
def __init__(self, block_size, header_size=0, sparse=False):
self.block_size = block_size
self.header_size = header_size
# should borg try to do sparse input processing?
# whether it actually can be done depends on the input file being seekable.
self.try_sparse = sparse and hasattr(os, 'SEEK_DATA') and hasattr(os, 'SEEK_HOLE')
self.zeros = memoryview(bytes(block_size)) if self.try_sparse else None

def chunkify(self, fd, fh=-1):
def chunkify(self, fd=None, fh=-1, fmap=None):
"""
Cut a file into chunks.
:param fd: Python file object
:param fh: OS-level file handle (if available),
defaults to -1 which means not to use OS-level fd.
"""
if fmap is None:
if self.try_sparse:
try:
if self.header_size > 0:
header_map = [(0, self.header_size, True), ]
dseek(self.header_size, os.SEEK_SET, fd, fh)
body_map = list(sparsemap(fd, fh))
dseek(0, os.SEEK_SET, fd, fh)
else:
header_map = []
body_map = list(sparsemap(fd, fh))
except OSError as err:
# seeking in sparsemap did not work
pass
else:
fmap = header_map + body_map

if fmap is None:
# either sparse processing (building the fmap) was not tried or it failed.
# in these cases, we just build a "fake fmap" that considers the whole file
# as range(s) of data (no holes), so we can use the same code.
# we build different fmaps here for the purpose of correct block alignment
# with or without a header block (of potentially different size).
if self.header_size > 0:
header_map = [(0, self.header_size, True), ]
body_map = [(self.header_size, 2 ** 62, True), ]
else:
header_map = []
body_map = [(0, 2 ** 62, True), ]
fmap = header_map + body_map

offset = 0
use_fh = fh >= 0

if use_fh:
def read(size):
nonlocal offset
data = os.read(fh, size)
amount = len(data)
if hasattr(os, 'posix_fadvise'):
# UNIX only and, in case of block sizes that are not a multiple of the
# system's page size, better be used with a bug fixed linux kernel > 4.6.0,
# see comment/workaround in _chunker.c and borgbackup issue #907.
os.posix_fadvise(fh, offset, amount, os.POSIX_FADV_DONTNEED)
offset += amount
return data
else:
def read(size):
nonlocal offset
data = fd.read(size)
amount = len(data)
offset += amount
return data

if self.header_size > 0:
data = read(self.header_size)
if data:
yield data
else:
data = True # get into next while loop
while data:
data = read(self.block_size)
if data:
yield data
# empty data means we are at EOF and we terminate the generator.
for range_start, range_size, is_data in fmap:
while range_size:
wanted = min(range_size, self.block_size)
if is_data:
# read blocks from the range with the desired read_size, if possible
data = dread(offset, wanted, fd, fh)
else: # hole
# seek over blocks from the range with the desired read_size, if possible
pos = dseek(wanted, os.SEEK_CUR, fd, fh)
data = self.zeros[:pos - offset] # for now, create zero-bytes here
got = len(data)
if got > 0:
offset += got
range_size -= got
yield data # later, use a better api that tags data vs. sparse
if got < wanted:
# we did not get enough data, looks like early EOF.
return


cdef class Chunker:
Expand Down Expand Up @@ -129,7 +224,8 @@ def get_chunker(algo, *params, **kw):
seed = kw['seed']
return Chunker(seed, *params)
if algo == 'fixed':
return ChunkerFixed(*params)
sparse = kw['sparse']
return ChunkerFixed(*params, sparse=sparse)
raise TypeError('unsupported chunker algo %r' % algo)


Expand Down
120 changes: 120 additions & 0 deletions src/borg/testsuite/chunker_pytest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
from io import BytesIO
import os

import pytest

from ..chunker import ChunkerFixed, sparsemap
from ..constants import * # NOQA

BS = 4096 # fs block size

# some sparse files. X = content blocks, _ = sparse blocks.
# X__XXX____
map_sparse1 = [
(0 * BS, 1 * BS, True),
(1 * BS, 2 * BS, False),
(3 * BS, 3 * BS, True),
(6 * BS, 4 * BS, False),
]

# _XX___XXXX
map_sparse2 = [
(0 * BS, 1 * BS, False),
(1 * BS, 2 * BS, True),
(3 * BS, 3 * BS, False),
(6 * BS, 4 * BS, True),
]

# XXX
map_notsparse = [(0 * BS, 3 * BS, True), ]

# ___
map_onlysparse = [(0 * BS, 3 * BS, False), ]


def make_sparsefile(fname, sparsemap, header_size=0):
    """
    Create a sparse file at *fname* from a range map.

    :param fname: path of the file to create
    :param sparsemap: iterable of (offset, size, is_data) tuples; data ranges
                      are written as b'X' bytes, holes are seeked over
    :param header_size: if > 0, prepend that many b'H' header bytes
    """
    total = 0
    with open(fname, 'wb') as f:
        if header_size:
            f.write(b'H' * header_size)
            total += header_size
        for _offset, size, is_data in sparsemap:
            if is_data:
                f.write(b'X' * size)
            else:
                # seeking (instead of writing zeros) is what creates the hole
                f.seek(size, os.SEEK_CUR)
            total += size
        # make sure a trailing hole is reflected in the file length
        f.truncate(total)
    assert os.path.getsize(fname) == total


def make_content(sparsemap, header_size=0):
    """
    Return the expected logical content of a sparse file built from *sparsemap*.

    :param sparsemap: iterable of (offset, size, is_data) tuples; data ranges
                      become b'X' bytes, holes become zero bytes
    :param header_size: if > 0, prepend that many b'H' header bytes
    :return: the complete file content as bytes
    """
    pieces = []
    total = 0
    if header_size:
        pieces.append(b'H' * header_size)
        total += header_size
    for _offset, size, is_data in sparsemap:
        # holes read back as zero bytes
        pieces.append((b'X' if is_data else b'\0') * size)
        total += size
    content = b''.join(pieces)
    assert len(content) == total
    return content


@pytest.mark.parametrize("fname, sparse_map", [
    ('sparse1', map_sparse1),
    ('sparse2', map_sparse2),
    ('onlysparse', map_onlysparse),
    ('notsparse', map_notsparse),
])
def test_sparsemap(tmpdir, fname, sparse_map):
    """sparsemap must yield the same ranges via an OS-level fd and a file object."""

    def map_via_fh(path):
        # OS-level file handle variant
        handle = os.open(path, flags=os.O_RDONLY)
        try:
            return list(sparsemap(fh=handle))
        finally:
            os.close(handle)

    def map_via_fd(path):
        # Python file object variant
        with open(path, 'rb') as f:
            return list(sparsemap(fd=f))

    path = str(tmpdir / fname)
    make_sparsefile(path, sparse_map)
    assert map_via_fh(path) == sparse_map
    assert map_via_fd(path) == sparse_map


@pytest.mark.parametrize("fname, sparse_map, header_size, sparse", [
    ('sparse1', map_sparse1, 0, False),
    ('sparse1', map_sparse1, 0, True),
    ('sparse1', map_sparse1, BS, False),
    ('sparse1', map_sparse1, BS, True),
    ('sparse2', map_sparse2, 0, False),
    ('sparse2', map_sparse2, 0, True),
    ('sparse2', map_sparse2, BS, False),
    ('sparse2', map_sparse2, BS, True),
    ('onlysparse', map_onlysparse, 0, False),
    ('onlysparse', map_onlysparse, 0, True),
    ('onlysparse', map_onlysparse, BS, False),
    ('onlysparse', map_onlysparse, BS, True),
    ('notsparse', map_notsparse, 0, False),
    ('notsparse', map_notsparse, 0, True),
    ('notsparse', map_notsparse, BS, False),
    ('notsparse', map_notsparse, BS, True),
])
def test_chunkify_sparse(tmpdir, fname, sparse_map, header_size, sparse):
    """Chunkified output must reassemble to the file's logical content."""

    def get_chunks(fname, sparse, header_size):
        chunker = ChunkerFixed(4096, header_size=header_size, sparse=sparse)
        with open(fname, 'rb') as fd:
            return b''.join([c for c in chunker.chunkify(fd)])

    fn = str(tmpdir / fname)
    make_sparsefile(fn, sparse_map, header_size=header_size)
    # bug fix: the comparison result was previously discarded (missing `assert`),
    # so this test could never fail regardless of what chunkify produced.
    assert get_chunks(fn, sparse=sparse, header_size=header_size) == \
        make_content(sparse_map, header_size=header_size)

0 comments on commit a21ea5d

Please sign in to comment.