From 8620d9fda40ea242ba024990c4d6f4cf4e36415d Mon Sep 17 00:00:00 2001 From: Kamil Mankowski Date: Tue, 9 Jul 2024 11:49:29 +0200 Subject: [PATCH] FIX: Support for extracting data from archives with dirs When a zip or tar archive contains directories, they appear in the default listings in addition to the files they contain. This causes exceptions or the extraction of empty data, which eventually causes issues when creating a report message. --- CHANGELOG.md | 2 ++ intelmq/lib/utils.py | 4 ++-- intelmq/tests/assets/subdir.tar.gz | Bin 0 -> 183 bytes intelmq/tests/assets/subdir.tar.gz.license | 3 +++ intelmq/tests/assets/subdir.zip | Bin 0 -> 430 bytes intelmq/tests/assets/subdir.zip.license | 3 +++ .../bots/collectors/http/test_collector.py | 19 ++++++++++++++++++ intelmq/tests/lib/test_utils.py | 16 +++++++++++++++ 8 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 intelmq/tests/assets/subdir.tar.gz create mode 100644 intelmq/tests/assets/subdir.tar.gz.license create mode 100644 intelmq/tests/assets/subdir.zip create mode 100644 intelmq/tests/assets/subdir.zip.license diff --git a/CHANGELOG.md b/CHANGELOG.md index f6fb896a7..8de554638 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ ### Core - `intelmq.lib.utils.drop_privileges`: When IntelMQ is called as `root` and dropping the privileges to user `intelmq`, also set the non-primary groups associated with the `intelmq` user. Makes the behaviour of running intelmqctl as `root` closer to the behaviour of `sudo -u intelmq ...` (PR#2507 by Mikk Margus Möll). +- `intelmq.lib.utils.unzip`: Filter out directory entries when extracting data fixing the issue that + archives with directories causes extracting empty data for a directory entry (PR# by Kamil Mankowski). 
### Development diff --git a/intelmq/lib/utils.py b/intelmq/lib/utils.py index 42d551ad9..de59a223a 100644 --- a/intelmq/lib/utils.py +++ b/intelmq/lib/utils.py @@ -538,7 +538,7 @@ def extract_tar(file): def extract(filename): return tar.extractfile(filename).read() - return tuple(file.name for file in tar.getmembers()), tar, extract + return tuple(file.name for file in tar.getmembers() if file.isfile()), tar, extract def extract_gzip(file): @@ -547,7 +547,7 @@ def extract_gzip(file): def extract_zip(file): zfp = zipfile.ZipFile(io.BytesIO(file), "r") - return zfp.namelist(), zfp, zfp.read + return [member.filename for member in zfp.infolist() if not member.is_dir()], zfp, zfp.read def unzip(file: bytes, extract_files: Union[bool, list], logger=None, diff --git a/intelmq/tests/assets/subdir.tar.gz b/intelmq/tests/assets/subdir.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..03daf10d51d353d2180e5c9cddb1d997e625abcd GIT binary patch literal 183 zcmV;o07(BIiwFS6_l#x$1MQSS3c@fDMYHx4a|36RNphYR6^crsNx|bg(x9#?E@F|) z+x$fa^26|w=Ke5D`_nj@Y9L}@21$w@-?KmjoE_VnVqG13I000N?S_%LF literal 0 HcmV?d00001 diff --git a/intelmq/tests/assets/subdir.tar.gz.license b/intelmq/tests/assets/subdir.tar.gz.license new file mode 100644 index 000000000..056d32ec6 --- /dev/null +++ b/intelmq/tests/assets/subdir.tar.gz.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2024 CERT.at GmbH + +SPDX-License-Identifier: AGPL-3.0-or-later diff --git a/intelmq/tests/assets/subdir.zip b/intelmq/tests/assets/subdir.zip new file mode 100644 index 0000000000000000000000000000000000000000..5fba87a8563e7317c513e6f8284e59156355245b GIT binary patch literal 430 zcmWIWW@Zs#W&nZUml0qZO0Waz;?ks)%p&~&pdv1?qNlY^-pjvuuyO(=L6{4nC@HZB zh!jdvD@wQ!8Xf>OO!rdv#-brDKOcx-8W@@EnQ=Kp1!xNh2sFHP1kq4uaWN=>cnpjT z5