Skip to content

Commit

Permalink
Speed up tar packing by lowering compresslevel and creating symbolic …
Browse files Browse the repository at this point in the history
…links for same files
  • Loading branch information
gdh1995 committed Aug 26, 2024
1 parent 447fb8e commit ee4999c
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 18 deletions.
66 changes: 56 additions & 10 deletions pkg/private/tar/build_tar.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ class TarFile(object):
class DebError(Exception):
pass

def __init__(self, output, directory, compression, compressor, create_parents, allow_dups_from_deps, default_mtime):
def __init__(self, output, directory, compression, compressor, create_parents,
allow_dups_from_deps, auto_deduplicate, default_mtime, compresslevel=None):
# Directory prefix on all output paths
d = directory.strip('/')
self.directory = (d + '/') if d else None
Expand All @@ -52,6 +53,8 @@ def __init__(self, output, directory, compression, compressor, create_parents, a
self.default_mtime = default_mtime
self.create_parents = create_parents
self.allow_dups_from_deps = allow_dups_from_deps
self.compresslevel = compresslevel
self.src_to_first_dest_map = {} if auto_deduplicate else None

def __enter__(self):
self.tarfile = tar_writer.TarFileWriter(
Expand All @@ -60,7 +63,8 @@ def __enter__(self):
self.compressor,
self.create_parents,
self.allow_dups_from_deps,
default_mtime=self.default_mtime)
default_mtime=self.default_mtime,
compresslevel=self.compresslevel)
return self

def __exit__(self, t, v, traceback):
Expand Down Expand Up @@ -98,6 +102,12 @@ def add_file(self, f, destfile, mode=None, ids=None, names=None):
copied to `self.directory/destfile` in the layer.
"""
dest = self.normalize_path(destfile)
if self.src_to_first_dest_map is not None:
normalized_src = normpath(f)
relative_path_to_link_to = self.auto_deduplicate(normalized_src, dest)
if relative_path_to_link_to:
self.add_link(dest, relative_path_to_link_to, mode=mode, ids=ids, names=names)
return
# If mode is unspecified, derive the mode from the file's mode.
if mode is None:
mode = 0o755 if os.access(f, os.X_OK) else 0o644
Expand All @@ -114,6 +124,23 @@ def add_file(self, f, destfile, mode=None, ids=None, names=None):
uname=names[0],
gname=names[1])

def auto_deduplicate(self, src_file, dest_file):
"""Detect whether to de-duplicate the destination file
Returns:
The relative path to create a symlink to or None
"""
if self.src_to_first_dest_map is not None:
first_dest = self.src_to_first_dest_map.get(src_file)
if first_dest is None:
real_src_file = os.path.realpath(src_file)
first_dest = self.src_to_first_dest_map.setdefault(real_src_file, dest_file)
self.src_to_first_dest_map[src_file] = first_dest
if first_dest != dest_file:
return os.path.relpath(first_dest, os.path.dirname(dest_file))
return None


def add_empty_file(self,
destfile,
mode=None,
Expand Down Expand Up @@ -269,13 +296,13 @@ def add_tree(self, tree_top, destpath, mode=None, ids=None, names=None):
for dir in dirs:
to_write[dest_dir + dir] = None
for file in sorted(files):
content_path = os.path.abspath(os.path.join(root, file))
content_path = os.path.join(root, file)
if os.name == "nt":
# "To specify an extended-length path, use the `\\?\` prefix. For
# example, `\\?\D:\very long path`."[1]
#
# [1]: https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation
to_write[dest_dir + file] = "\\\\?\\" + content_path
to_write[dest_dir + file] = "\\\\?\\" + os.path.abspath(content_path)
else:
to_write[dest_dir + file] = content_path

Expand All @@ -297,6 +324,10 @@ def add_tree(self, tree_top, destpath, mode=None, ids=None, names=None):
f_mode = 0o755 if os.access(content_path, os.X_OK) else 0o644
else:
f_mode = mode
relative_path_to_link_to = self.auto_deduplicate(content_path, dest)
if relative_path_to_link_to:
self.add_link(dest, relative_path_to_link_to, mode=f_mode, ids=ids, names=names)
continue
self.tarfile.add_file(
path,
file_content=content_path,
Expand Down Expand Up @@ -345,7 +376,7 @@ def main():
fromfile_prefix_chars='@')
parser.add_argument('--output', required=True,
help='The output file, mandatory.')
parser.add_argument('--manifest',
parser.add_argument('--manifest', action='append',
help='manifest of contents to add to the layer.')
parser.add_argument('--mode',
help='Force the mode on the added files (in octal).')
Expand All @@ -359,7 +390,7 @@ def main():
parser.add_argument('--deb', action='append',
help='A debian package to add to the layer')
parser.add_argument(
'--directory',
'--directory', action='append',
help='Directory in which to store the file inside the layer')

compression = parser.add_mutually_exclusive_group()
Expand Down Expand Up @@ -397,6 +428,12 @@ def main():
parser.add_argument('--allow_dups_from_deps',
action='store_true',
help='')
parser.add_argument('--auto_deduplicate',
action='store_true',
help='Auto create symlinks for files mapped from a same source in manifests.')
parser.add_argument(
'--compresslevel', default='',
help='Specify the numeric compress level in gzip mode; may be 0-9 or empty(6).')
options = parser.parse_args()

# Parse modes arguments
Expand Down Expand Up @@ -443,12 +480,14 @@ def main():
# Add objects to the tar file
with TarFile(
options.output,
directory = helpers.GetFlagValue(options.directory),
directory = helpers.GetFlagValue(options.directory[0]),
compression = options.compression,
compressor = options.compressor,
default_mtime=default_mtime,
create_parents=options.create_parents,
allow_dups_from_deps=options.allow_dups_from_deps) as output:
allow_dups_from_deps=options.allow_dups_from_deps,
auto_deduplicate=options.auto_deduplicate,
compresslevel = options.compresslevel) as output:

def file_attributes(filename):
if filename.startswith('/'):
Expand All @@ -459,12 +498,19 @@ def file_attributes(filename):
'names': names_map.get(filename, default_ownername),
}

if options.manifest:
with open(options.manifest, 'r') as manifest_fp:
normalized_first_directory = output.directory
manifest_list = zip(options.directory, options.manifest)
if options.auto_deduplicate:
manifest_list = list(manifest_list)[::-1]
for directory, manifest_path in manifest_list:
directory = helpers.GetFlagValue(directory)
output.directory = (directory.strip('/') + '/') if directory.strip('/') else None
with open(manifest_path, 'r') as manifest_fp:
manifest_entries = manifest.read_entries_from(manifest_fp)
for entry in manifest_entries:
output.add_manifest_entry(entry, file_attributes)

output.directory = normalized_first_directory
for tar in options.tar or []:
output.add_tar(tar)
for deb in options.deb or []:
Expand Down
92 changes: 87 additions & 5 deletions pkg/private/tar/tar.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ SUPPORTED_TAR_COMPRESSIONS = (
_DEFAULT_MTIME = -1
_stamp_condition = Label("//pkg/private:private_stamp_detect")

MappingManifestInfo = provider(
"Path mapping to pack files",
fields = {
"manifest": "a list of (expanded package_dir, manfiest files, deps).",
},
)

def _remap(remap_paths, path):
"""If path starts with a key in remap_paths, rewrite it."""
for prefix, replacement in remap_paths.items():
Expand Down Expand Up @@ -69,8 +76,11 @@ def _pkg_tar_impl(ctx):
if ctx.attr.package_dir_file:
if ctx.attr.package_dir:
fail("Both package_dir and package_dir_file attributes were specified")
if ctx.attr.merge_mappings:
fail("Can not merge tarball mappings when package_dir_file is specified")
args.add("--directory", "@" + ctx.file.package_dir_file.path)
files.append(ctx.file.package_dir_file)
package_dir_expanded = None
else:
package_dir_expanded = substitute_package_variables(ctx, ctx.attr.package_dir)
args.add("--directory", package_dir_expanded or "/")
Expand Down Expand Up @@ -114,6 +124,10 @@ def _pkg_tar_impl(ctx):
"--owner_names",
"%s=%s" % (_quote(key), ctx.attr.ownernames[key]),
)
if ctx.attr.compresslevel:
args.add("--compresslevel", ctx.attr.compresslevel)
if ctx.attr.auto_deduplicate:
args.add("--auto_deduplicate")

# Now we begin processing the files.
path_mapper = None
Expand Down Expand Up @@ -151,8 +165,6 @@ def _pkg_tar_impl(ctx):
add_empty_file(mapping_context, empty_file, ctx.label)
for empty_dir in ctx.attr.empty_dirs or []:
add_directory(mapping_context, empty_dir, ctx.label)
for f in ctx.files.deps:
args.add("--tar", f.path)
for link in ctx.attr.symlinks:
add_symlink(
mapping_context,
Expand All @@ -170,6 +182,29 @@ def _pkg_tar_impl(ctx):
write_manifest(ctx, manifest_file, mapping_context.content_map)
args.add("--manifest", manifest_file.path)

does_merge_mappings = ctx.attr.merge_mappings
new_dir_prefix = package_dir_expanded + "/" if package_dir_expanded else ""
manifest_list = [(package_dir_expanded, manifest_file, mapping_context.file_deps)]
file_dep_set = {}
for dep_i in ctx.attr.deps:
if does_merge_mappings and (MappingManifestInfo in dep_i):
for i_dir, i_manifest, i_file_deps in dep_i[MappingManifestInfo].manifest:
i_dir = new_dir_prefix + (i_dir or "")
args.add("--directory", i_dir)
args.add("--manifest", i_manifest.path)
files.append(i_manifest)
for i in i_file_deps:
file_dep_set[i] = 1
manifest_list.append((i_dir, i_manifest, i_file_deps))
else:
for dep_file in dep_i.files.to_list():
if does_merge_mappings and dep_file.path.startswith("bazel-out/"):
fail("Please avoid depending on generated .tar directly: " + dep_file.path)
args.add("--tar", dep_file.path)
files += dep_i.files.to_list()
for i in mapping_context.file_deps:
file_dep_set[i] = 1

args.set_param_file_format("flag_per_line")
args.use_param_file("@%s", use_always = False)

Expand All @@ -180,8 +215,8 @@ def _pkg_tar_impl(ctx):
args.add("--allow_dups_from_deps")

inputs = depset(
direct = ctx.files.deps + files,
transitive = mapping_context.file_deps,
direct = files,
transitive = list(file_dep_set.keys()),
)

ctx.actions.run(
Expand Down Expand Up @@ -212,7 +247,11 @@ def _pkg_tar_impl(ctx):
OutputGroupInfo(
manifest = [manifest_file],
),
]
] + ([
MappingManifestInfo(
manifest = manifest_list,
),
] if does_merge_mappings else [])

# A rule for creating a tar file, see README.md
pkg_tar_impl = rule(
Expand Down Expand Up @@ -256,6 +295,14 @@ pkg_tar_impl = rule(
"extension": attr.string(default = "tar"),
"symlinks": attr.string_dict(),
"empty_files": attr.string_list(),
"merge_mappings": attr.bool(
doc = """Repack tar files in `deps` by re-applying their manifest files.""",
default = False,
),
"auto_deduplicate": attr.bool(
doc = """Auto create symlinks for files mapped from a same source in manifests.""",
default = False,
),
"include_runfiles": attr.bool(
doc = ("""Include runfiles for executables. These appear as they would in bazel-bin."""
+ """For example: 'path/to/myprog.runfiles/path/to/my_data.txt'."""),
Expand All @@ -272,6 +319,10 @@ pkg_tar_impl = rule(
),
"create_parents": attr.bool(default = True),
"allow_duplicates_from_deps": attr.bool(default = False),
"compresslevel": attr.string(
doc = """Specify the numeric compress level in gzip mode; may be 0-9 or empty (6).""",
default = "",
),

# Common attributes
"out": attr.output(mandatory = True),
Expand Down Expand Up @@ -342,3 +393,34 @@ def pkg_tar(name, **kwargs):
}),
**kwargs
)

def _pkg_tar_group_impl(ctx):
manifest_list = []
output_files = []
for i in ctx.attr.srcs:
if MappingManifestInfo in i:
manifest_list += i[MappingManifestInfo].manifest
output_files += i.files.to_list()
if manifest_list and len(manifest_list) < len(output_files):
fail("Can not merge generated tar files and source ones; please split into different groups.")
return [
DefaultInfo(
files = depset(output_files),
),
MappingManifestInfo(
manifest = manifest_list,
),
]

pkg_tar_group = rule(
doc = """Expose a group of source tar files.""",
implementation = _pkg_tar_group_impl,
attrs = {
"srcs": attr.label_list(
doc = """Tar files generated by pkg_tar().""",
mandatory = True,
allow_files = tar_filetype,
),
},
provides = [MappingManifestInfo],
)
6 changes: 4 additions & 2 deletions pkg/private/tar/tar_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def __init__(self,
create_parents=False,
allow_dups_from_deps=True,
default_mtime=None,
preserve_tar_mtimes=True):
preserve_tar_mtimes=True,
compresslevel=None):
"""TarFileWriter wraps tarfile.open().
Args:
Expand Down Expand Up @@ -86,10 +87,11 @@ def __init__(self,
else:
mode = 'w:'
if compression in ['tgz', 'gz']:
compresslevel = int(compresslevel) if compresslevel or compresslevel == 0 else 6
# The Tarfile class doesn't allow us to specify gzip's mtime attribute.
# Instead, we manually reimplement gzopen from tarfile.py and set mtime.
self.fileobj = gzip.GzipFile(
filename=name, mode='w', compresslevel=6, mtime=self.default_mtime)
filename=name, mode='w', compresslevel=compresslevel, mtime=self.default_mtime)
self.compressor_proc = None
if self.compressor_cmd:
mode = 'w|'
Expand Down
3 changes: 2 additions & 1 deletion pkg/tar.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
"""Forwarder for pkg_tar."""

load("//pkg/private/tar:tar.bzl", _pkg_tar = "pkg_tar")
load("//pkg/private/tar:tar.bzl", _pkg_tar = "pkg_tar", _pkg_tar_group = "pkg_tar_group")

pkg_tar = _pkg_tar
pkg_tar_group = _pkg_tar_group

0 comments on commit ee4999c

Please sign in to comment.