From 7c2983c869ae88ca878af47f23d8b51c34b58c9e Mon Sep 17 00:00:00 2001 From: aiuto Date: Mon, 13 Feb 2023 23:25:09 -0500 Subject: [PATCH] pkg_zip: Some unicode file handling fixes and basic tests (#641) Add primitive zip unicode filename handling tests - Just make sure we can create an archive with non-ASCII file names. --- .bazelci/tests.yml | 1 + developers.md | 2 ++ pkg/private/manifest.py | 9 ++++++++- pkg/private/zip/build_zip.py | 23 ++++++++++++----------- tests/BUILD | 20 ++++++++++++++++++-- tests/mappings/BUILD | 1 + tests/zip/BUILD | 20 +++++++++++++++++++- tests/zip/unicode_test.py | 34 ++++++++++++++++++++++++++++++++++ tests/zip/zip_test_lib.py | 3 ++- 9 files changed, 97 insertions(+), 16 deletions(-) create mode 100644 tests/zip/unicode_test.py diff --git a/.bazelci/tests.yml b/.bazelci/tests.yml index 88e3699..bc997e6 100644 --- a/.bazelci/tests.yml +++ b/.bazelci/tests.yml @@ -47,6 +47,7 @@ win_tests: &win_tests # https://github.com/bazelbuild/bazel/issues?q=is%3Aissue+is%3Aopen+%2Bunicode+%2Bwindows+ - "-//tests/mappings:utf8_manifest_test" - "-//tests/mappings/filter_directory/..." + - "-//tests/zip:unicode_test" # See #387 - "-//tests/install/..." # rpmbuild(8) is not supported on Windows diff --git a/developers.md b/developers.md index 17a95c0..611b1e0 100644 --- a/developers.md +++ b/developers.md @@ -87,6 +87,8 @@ organization will commit to maintaining that feature and responding to issues. - We use Python 3. We are not trying to support Python 2 at all. - Always import with a full path from the root of the source tree. +- Compatibility back to python 3.6 (for CentOS 7 and Ubuntu 18.04 LTS) is + required. ### Dependencies diff --git a/pkg/private/manifest.py b/pkg/private/manifest.py index 0edda4e..57b7ddd 100644 --- a/pkg/private/manifest.py +++ b/pkg/private/manifest.py @@ -49,9 +49,16 @@ class ManifestEntry(object): def read_entries_from(fh): """Return a list of ManifestEntry's from `fh`""" - raw_entries = json.load(fh) + # Subtle: decode the content with read() rather than in json.load() because + # the load in older python releases (< 3.7?) does not know how to decode. + raw_entries = json.loads(fh.read()) return [ManifestEntry(**entry) for entry in raw_entries] +def read_entries_from_file(manifest_path): + """Return a list of ManifestEntry's from the manifest file at `path`""" + with open(manifest_path, 'r', encoding='utf-8') as fh: + return read_entries_from(fh) + def entry_type_to_string(et): """Entry type stringifier""" if et == ENTRY_IS_FILE: diff --git a/pkg/private/zip/build_zip.py b/pkg/private/zip/build_zip.py index 4ef8f65..64c4206 100644 --- a/pkg/private/zip/build_zip.py +++ b/pkg/private/zip/build_zip.py @@ -142,8 +142,9 @@ class ZipWriter(object): if entry_type == manifest.ENTRY_IS_FILE: entry_info.compress_type = zipfile.ZIP_DEFLATED - with open(src, 'rb') as src: - self.zip_file.writestr(entry_info, src.read()) + # Using utf-8 for the file names is for python <3.7 compatibility. + with open(src.encode('utf-8'), 'rb') as src_content: + self.zip_file.writestr(entry_info, src_content.read()) elif entry_type == manifest.ENTRY_IS_DIR: entry_info.compress_type = zipfile.ZIP_STORED # Set directory bits @@ -153,7 +154,7 @@ class ZipWriter(object): entry_info.compress_type = zipfile.ZIP_STORED # Set directory bits entry_info.external_attr |= (UNIX_SYMLINK_BIT << 16) - self.zip_file.writestr(entry_info, src) + self.zip_file.writestr(entry_info, src.encode('utf-8')) elif entry_type == manifest.ENTRY_IS_TREE: self.add_tree(src, dst_path, mode) elif entry_type == manifest.ENTRY_IS_EMPTY_FILE: @@ -226,9 +227,10 @@ class ZipWriter(object): entry_info.external_attr |= (UNIX_DIR_BIT << 16) | MSDOS_DIR_BIT self.zip_file.writestr(entry_info, '') -def _load_manifest(prefix, manifest_fp): +def _load_manifest(prefix, manifest_path): manifest_map = {} - for entry in manifest.read_entries_from(manifest_fp): + + for entry in manifest.read_entries_from_file(manifest_path): entry.dest = _combine_paths(prefix, entry.dest) manifest_map[entry.dest] = entry @@ -263,12 +265,11 @@ def main(args): if args.mode: default_mode = int(args.mode, 8) - with open(args.manifest, 'r') as manifest_fp: - manifest = _load_manifest(args.directory, manifest_fp) - with ZipWriter( - args.output, time_stamp=ts, default_mode=default_mode) as zip_out: - for entry in manifest: - zip_out.add_manifest_entry(entry) + manifest = _load_manifest(args.directory, args.manifest) + with ZipWriter( + args.output, time_stamp=ts, default_mode=default_mode) as zip_out: + for entry in manifest: + zip_out.add_manifest_entry(entry) if __name__ == '__main__': diff --git a/tests/BUILD b/tests/BUILD index 7f4c989..21b6396 100644 --- a/tests/BUILD +++ b/tests/BUILD @@ -14,11 +14,10 @@ # -*- coding: utf-8 -*- load("@rules_python//python:defs.bzl", "py_test") -load("@bazel_skylib//rules:copy_file.bzl", "copy_file") load(":my_package_name.bzl", "my_package_naming") load(":path_test.bzl", "path_tests") load("//pkg:deb.bzl", "pkg_deb") -load("//pkg:mappings.bzl", "pkg_attributes", "pkg_mkdirs") +load("//pkg:mappings.bzl", "pkg_attributes", "pkg_files", "strip_prefix") load("//pkg:tar.bzl", "pkg_tar") load("//pkg:zip.bzl", "pkg_zip") @@ -53,6 +52,23 @@ filegroup( srcs = glob(["**/*.txt"]), ) +# +# Data source for Unicode handling tests +# +pkg_files( + name = "utf8_files", + srcs = [ + "//tests/testdata/utf8:files", + ], + # Note: This is temporary. We need to fix a latent bug where + # we are using 555 as the default for many things. That was the + # Google internal behavior. + # See https://github.com/bazelbuild/rules_pkg/issues/302 for thoughts. + attributes = pkg_attributes(mode = "0o555"), + strip_prefix = strip_prefix.from_pkg(), + visibility = ["//tests:__subpackages__"], +) + py_test( name = "archive_test", srcs = [ diff --git a/tests/mappings/BUILD b/tests/mappings/BUILD index 8de6c51..44752cb 100644 --- a/tests/mappings/BUILD +++ b/tests/mappings/BUILD @@ -109,6 +109,7 @@ pkg_files( "//tests/testdata/utf8:files", ], strip_prefix = strip_prefix.from_pkg(), + visibility = ["//tests:__subpackages__"], ) write_content_manifest( diff --git a/tests/zip/BUILD b/tests/zip/BUILD index 56c739b..0a545ec 100644 --- a/tests/zip/BUILD +++ b/tests/zip/BUILD @@ -256,7 +256,6 @@ py_test( python_version = "PY3", deps = [ ":zip_test_lib", - "@bazel_tools//tools/python/runfiles", ], ) @@ -282,3 +281,22 @@ py_test( ":zip_test_lib", ], ) + +pkg_zip( + name = "unicode_names", + srcs = ["//tests:utf8_files"] +) + +py_test( + name = "unicode_test", + srcs = [ + "unicode_test.py", + ], + data = [ + ":unicode_names", + ], + python_version = "PY3", + deps = [ + ":zip_test_lib", + ], +) diff --git a/tests/zip/unicode_test.py b/tests/zip/unicode_test.py new file mode 100644 index 0000000..641394a --- /dev/null +++ b/tests/zip/unicode_test.py @@ -0,0 +1,34 @@ +# Copyright 2022 The Bazel Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from tests.zip import zip_test_lib + +class UnicodeHandlingTests(zip_test_lib.ZipContentsTestBase): + + def test_unicode_file_names(self): + self.assertZipFileContent("unicode_names.zip", [ + {"filename": "1-a"}, + {"filename": "2-λ"}, + {"filename": "3-世"}, + {"filename": "BUILD"}, + {"filename": "sübdir/", "isdir": True}, + {"filename": "sübdir/2-λ"}, + {"filename": "sübdir/hello"}, + ]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/zip/zip_test_lib.py b/tests/zip/zip_test_lib.py index 2832df8..ada8303 100644 --- a/tests/zip/zip_test_lib.py +++ b/tests/zip/zip_test_lib.py @@ -91,4 +91,5 @@ class ZipContentsTestBase(ZipTest): # legacy rule implementation. attr = 0o555 self.assertEqual(oct((info.external_attr >> 16) & UNIX_RWX_BITS), - oct(attr)) + oct(attr), + msg = info.filename)