Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up tar packing by lower compresslevel and create symbolic links for same files #887

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 56 additions & 10 deletions pkg/private/tar/build_tar.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ class TarFile(object):
class DebError(Exception):
pass

def __init__(self, output, directory, compression, compressor, create_parents, allow_dups_from_deps, default_mtime):
def __init__(self, output, directory, compression, compressor, create_parents,
allow_dups_from_deps, auto_deduplicate, default_mtime, compresslevel=None):
# Directory prefix on all output paths
d = directory.strip('/')
self.directory = (d + '/') if d else None
Expand All @@ -52,6 +53,8 @@ def __init__(self, output, directory, compression, compressor, create_parents, a
self.default_mtime = default_mtime
self.create_parents = create_parents
self.allow_dups_from_deps = allow_dups_from_deps
self.compresslevel = compresslevel
self.src_to_first_dest_map = {} if auto_deduplicate else None

def __enter__(self):
self.tarfile = tar_writer.TarFileWriter(
Expand All @@ -60,7 +63,8 @@ def __enter__(self):
self.compressor,
self.create_parents,
self.allow_dups_from_deps,
default_mtime=self.default_mtime)
default_mtime=self.default_mtime,
compresslevel=self.compresslevel)
return self

def __exit__(self, t, v, traceback):
Expand Down Expand Up @@ -98,6 +102,12 @@ def add_file(self, f, destfile, mode=None, ids=None, names=None):
copied to `self.directory/destfile` in the layer.
"""
dest = self.normalize_path(destfile)
if self.src_to_first_dest_map is not None:
normalized_src = normpath(f)
relative_path_to_link_to = self.auto_deduplicate(normalized_src, dest)
if relative_path_to_link_to:
self.add_link(dest, relative_path_to_link_to, mode=mode, ids=ids, names=names)
return
# If mode is unspecified, derive the mode from the file's mode.
if mode is None:
mode = 0o755 if os.access(f, os.X_OK) else 0o644
Expand All @@ -114,6 +124,23 @@ def add_file(self, f, destfile, mode=None, ids=None, names=None):
uname=names[0],
gname=names[1])

def auto_deduplicate(self, src_file, dest_file):
"""Detect whether to de-duplicate the destination file

Returns:
The relative path to create a symlink to or None
"""
if self.src_to_first_dest_map is not None:
first_dest = self.src_to_first_dest_map.get(src_file)
if first_dest is None:
real_src_file = os.path.realpath(src_file)
first_dest = self.src_to_first_dest_map.setdefault(real_src_file, dest_file)
self.src_to_first_dest_map[src_file] = first_dest
if first_dest != dest_file:
return os.path.relpath(first_dest, os.path.dirname(dest_file))
return None


def add_empty_file(self,
destfile,
mode=None,
Expand Down Expand Up @@ -269,13 +296,13 @@ def add_tree(self, tree_top, destpath, mode=None, ids=None, names=None):
for dir in dirs:
to_write[dest_dir + dir] = None
for file in sorted(files):
content_path = os.path.abspath(os.path.join(root, file))
content_path = os.path.join(root, file)
if os.name == "nt":
# "To specify an extended-length path, use the `\\?\` prefix. For
# example, `\\?\D:\very long path`."[1]
#
# [1]: https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation
to_write[dest_dir + file] = "\\\\?\\" + content_path
to_write[dest_dir + file] = "\\\\?\\" + os.path.abspath(content_path)
else:
to_write[dest_dir + file] = content_path

Expand All @@ -297,6 +324,10 @@ def add_tree(self, tree_top, destpath, mode=None, ids=None, names=None):
f_mode = 0o755 if os.access(content_path, os.X_OK) else 0o644
else:
f_mode = mode
relative_path_to_link_to = self.auto_deduplicate(content_path, dest)
if relative_path_to_link_to:
self.add_link(dest, relative_path_to_link_to, mode=f_mode, ids=ids, names=names)
continue
self.tarfile.add_file(
path,
file_content=content_path,
Expand Down Expand Up @@ -345,7 +376,7 @@ def main():
fromfile_prefix_chars='@')
parser.add_argument('--output', required=True,
help='The output file, mandatory.')
parser.add_argument('--manifest',
parser.add_argument('--manifest', action='append',
help='manifest of contents to add to the layer.')
parser.add_argument('--mode',
help='Force the mode on the added files (in octal).')
Expand All @@ -359,7 +390,7 @@ def main():
parser.add_argument('--deb', action='append',
help='A debian package to add to the layer')
parser.add_argument(
'--directory',
'--directory', action='append',
help='Directory in which to store the file inside the layer')

compression = parser.add_mutually_exclusive_group()
Expand Down Expand Up @@ -397,6 +428,12 @@ def main():
parser.add_argument('--allow_dups_from_deps',
action='store_true',
help='')
parser.add_argument('--auto_deduplicate',
action='store_true',
help='Auto create symlinks for files mapped from a same source in manifests.')
parser.add_argument(
'--compresslevel', default='',
help='Specify the numeric compress level in gzip mode; may be 0-9 or empty(6).')
options = parser.parse_args()

# Parse modes arguments
Expand Down Expand Up @@ -443,12 +480,14 @@ def main():
# Add objects to the tar file
with TarFile(
options.output,
directory = helpers.GetFlagValue(options.directory),
directory = helpers.GetFlagValue(options.directory[0]),
compression = options.compression,
compressor = options.compressor,
default_mtime=default_mtime,
create_parents=options.create_parents,
allow_dups_from_deps=options.allow_dups_from_deps) as output:
allow_dups_from_deps=options.allow_dups_from_deps,
auto_deduplicate=options.auto_deduplicate,
compresslevel = options.compresslevel) as output:

def file_attributes(filename):
if filename.startswith('/'):
Expand All @@ -459,12 +498,19 @@ def file_attributes(filename):
'names': names_map.get(filename, default_ownername),
}

if options.manifest:
with open(options.manifest, 'r') as manifest_fp:
normalized_first_directory = output.directory
manifest_list = zip(options.directory, options.manifest)
if options.auto_deduplicate:
manifest_list = list(manifest_list)[::-1]
for directory, manifest_path in manifest_list:
directory = helpers.GetFlagValue(directory)
output.directory = (directory.strip('/') + '/') if directory.strip('/') else None
with open(manifest_path, 'r') as manifest_fp:
manifest_entries = manifest.read_entries_from(manifest_fp)
for entry in manifest_entries:
output.add_manifest_entry(entry, file_attributes)

output.directory = normalized_first_directory
for tar in options.tar or []:
output.add_tar(tar)
for deb in options.deb or []:
Expand Down
92 changes: 87 additions & 5 deletions pkg/private/tar/tar.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ SUPPORTED_TAR_COMPRESSIONS = (
_DEFAULT_MTIME = -1
_stamp_condition = Label("//pkg/private:private_stamp_detect")

MappingManifestInfo = provider(
"Path mapping to pack files",
fields = {
"manifest": "a list of (expanded package_dir, manfiest files, deps).",
},
)

def _remap(remap_paths, path):
"""If path starts with a key in remap_paths, rewrite it."""
for prefix, replacement in remap_paths.items():
Expand Down Expand Up @@ -69,8 +76,11 @@ def _pkg_tar_impl(ctx):
if ctx.attr.package_dir_file:
if ctx.attr.package_dir:
fail("Both package_dir and package_dir_file attributes were specified")
if ctx.attr.merge_mappings:
fail("Can not merge tarball mappings when package_dir_file is specified")
args.add("--directory", "@" + ctx.file.package_dir_file.path)
files.append(ctx.file.package_dir_file)
package_dir_expanded = None
else:
package_dir_expanded = substitute_package_variables(ctx, ctx.attr.package_dir)
args.add("--directory", package_dir_expanded or "/")
Expand Down Expand Up @@ -114,6 +124,10 @@ def _pkg_tar_impl(ctx):
"--owner_names",
"%s=%s" % (_quote(key), ctx.attr.ownernames[key]),
)
if ctx.attr.compresslevel:
args.add("--compresslevel", ctx.attr.compresslevel)
if ctx.attr.auto_deduplicate:
args.add("--auto_deduplicate")

# Now we begin processing the files.
path_mapper = None
Expand Down Expand Up @@ -151,8 +165,6 @@ def _pkg_tar_impl(ctx):
add_empty_file(mapping_context, empty_file, ctx.label)
for empty_dir in ctx.attr.empty_dirs or []:
add_directory(mapping_context, empty_dir, ctx.label)
for f in ctx.files.deps:
args.add("--tar", f.path)
for link in ctx.attr.symlinks:
add_symlink(
mapping_context,
Expand All @@ -170,6 +182,29 @@ def _pkg_tar_impl(ctx):
write_manifest(ctx, manifest_file, mapping_context.content_map)
args.add("--manifest", manifest_file.path)

does_merge_mappings = ctx.attr.merge_mappings
new_dir_prefix = package_dir_expanded + "/" if package_dir_expanded else ""
manifest_list = [(package_dir_expanded, manifest_file, mapping_context.file_deps)]
file_dep_set = {}
for dep_i in ctx.attr.deps:
if does_merge_mappings and (MappingManifestInfo in dep_i):
for i_dir, i_manifest, i_file_deps in dep_i[MappingManifestInfo].manifest:
i_dir = new_dir_prefix + (i_dir or "")
args.add("--directory", i_dir)
args.add("--manifest", i_manifest.path)
files.append(i_manifest)
for i in i_file_deps:
file_dep_set[i] = 1
manifest_list.append((i_dir, i_manifest, i_file_deps))
else:
for dep_file in dep_i.files.to_list():
if does_merge_mappings and dep_file.path.startswith("bazel-out/"):
fail("Please avoid depending on generated .tar directly: " + dep_file.path)
args.add("--tar", dep_file.path)
files += dep_i.files.to_list()
for i in mapping_context.file_deps:
file_dep_set[i] = 1

args.set_param_file_format("flag_per_line")
args.use_param_file("@%s", use_always = False)

Expand All @@ -180,8 +215,8 @@ def _pkg_tar_impl(ctx):
args.add("--allow_dups_from_deps")

inputs = depset(
direct = ctx.files.deps + files,
transitive = mapping_context.file_deps,
direct = files,
transitive = list(file_dep_set.keys()),
)

ctx.actions.run(
Expand Down Expand Up @@ -212,7 +247,11 @@ def _pkg_tar_impl(ctx):
OutputGroupInfo(
manifest = [manifest_file],
),
]
] + ([
MappingManifestInfo(
manifest = manifest_list,
),
] if does_merge_mappings else [])

# A rule for creating a tar file, see README.md
pkg_tar_impl = rule(
Expand Down Expand Up @@ -256,6 +295,14 @@ pkg_tar_impl = rule(
"extension": attr.string(default = "tar"),
"symlinks": attr.string_dict(),
"empty_files": attr.string_list(),
"merge_mappings": attr.bool(
doc = """Repack tar files in `deps` by re-applying their manifest files.""",
default = False,
),
"auto_deduplicate": attr.bool(
doc = """Auto create symlinks for files mapped from a same source in manifests.""",
default = False,
),
"include_runfiles": attr.bool(
doc = ("""Include runfiles for executables. These appear as they would in bazel-bin."""
+ """For example: 'path/to/myprog.runfiles/path/to/my_data.txt'."""),
Expand All @@ -272,6 +319,10 @@ pkg_tar_impl = rule(
),
"create_parents": attr.bool(default = True),
"allow_duplicates_from_deps": attr.bool(default = False),
"compresslevel": attr.string(
doc = """Specify the numeric compress level in gzip mode; may be 0-9 or empty (6).""",
default = "",
),

# Common attributes
"out": attr.output(mandatory = True),
Expand Down Expand Up @@ -342,3 +393,34 @@ def pkg_tar(name, **kwargs):
}),
**kwargs
)

def _pkg_tar_group_impl(ctx):
manifest_list = []
output_files = []
for i in ctx.attr.srcs:
if MappingManifestInfo in i:
manifest_list += i[MappingManifestInfo].manifest
output_files += i.files.to_list()
if manifest_list and len(manifest_list) < len(output_files):
fail("Can not merge generated tar files and source ones; please split into different groups.")
return [
DefaultInfo(
files = depset(output_files),
),
MappingManifestInfo(
manifest = manifest_list,
),
]

pkg_tar_group = rule(
doc = """Expose a group of source tar files.""",
implementation = _pkg_tar_group_impl,
attrs = {
"srcs": attr.label_list(
doc = """Tar files generated by pkg_tar().""",
mandatory = True,
allow_files = tar_filetype,
),
},
provides = [MappingManifestInfo],
)
6 changes: 4 additions & 2 deletions pkg/private/tar/tar_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def __init__(self,
create_parents=False,
allow_dups_from_deps=True,
default_mtime=None,
preserve_tar_mtimes=True):
preserve_tar_mtimes=True,
compresslevel=None):
"""TarFileWriter wraps tarfile.open().

Args:
Expand Down Expand Up @@ -86,10 +87,11 @@ def __init__(self,
else:
mode = 'w:'
if compression in ['tgz', 'gz']:
compresslevel = int(compresslevel) if compresslevel or compresslevel == 0 else 6
# The Tarfile class doesn't allow us to specify gzip's mtime attribute.
# Instead, we manually reimplement gzopen from tarfile.py and set mtime.
self.fileobj = gzip.GzipFile(
filename=name, mode='w', compresslevel=6, mtime=self.default_mtime)
filename=name, mode='w', compresslevel=compresslevel, mtime=self.default_mtime)
self.compressor_proc = None
if self.compressor_cmd:
mode = 'w|'
Expand Down
3 changes: 2 additions & 1 deletion pkg/tar.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
"""Forwarder for pkg_tar."""

load("//pkg/private/tar:tar.bzl", _pkg_tar = "pkg_tar")
load("//pkg/private/tar:tar.bzl", _pkg_tar = "pkg_tar", _pkg_tar_group = "pkg_tar_group")

pkg_tar = _pkg_tar
pkg_tar_group = _pkg_tar_group