[Serve] Handle name collisions when unzipping directories in unzip_package (#21723)

Currently, the `unzip_package` function relies on `extract_file_and_remove_top_level_dir` to unzip and remove the top-level directory from archive working directories. However, `extract_file_and_remove_top_level_dir` uses `os.rename()` to remove the tld by manually unzipping each file from a zip file and moving it to the tld's parent. When the tld contains directories or files with the same name as the tld, `os.rename()` fails to move these files to the tld's parent because of the name collision between the file and the tld.

This change replaces `extract_file_and_remove_top_level_dir` with `remove_dir_from_filepaths`. Now, `unzip_package` unzips the entire zip file before `remove_dir_from_filepaths` moves all the tld's children to the tld's parent using `os.rename()`.

This edge case is tested in the new unit test `test_unzip_with_matching_subdirectory_names`. Additionally, `extract_file_and_remove_top_level_dir`'s unit test is replaced with `TestRemoveDirFromFilepaths`, which tests the new `remove_dir_from_filepaths` function.
This commit is contained in:
shrekris-anyscale 2022-01-21 13:27:28 -08:00 committed by GitHub
parent 2954bf9a48
commit 45eebdd6e3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 97 additions and 64 deletions

View file

@ -1,4 +1,5 @@
from enum import Enum
from tempfile import TemporaryDirectory
from filelock import FileLock
import hashlib
import logging
@ -526,22 +527,29 @@ def get_top_level_dir_from_compressed_package(package_path: str):
return top_level_directory
def extract_file_and_remove_top_level_dir(base_dir: str, fname: str,
zip_ref: ZipFile):
def remove_dir_from_filepaths(base_dir: str, rdir: str):
"""
Extracts fname file from zip_ref zip file, removes the top level directory
from fname's file path, and stores fname in the base_dir.
base_dir: String path of the directory containing rdir
rdir: String path of directory relative to base_dir whose contents should
be moved to its base_dir, its parent directory
Removes rdir from the filepaths of all files and directories inside it.
In other words, moves all the files inside rdir to the directory that
contains rdir. Assumes base_dir's contents and rdir's contents have no
name conflicts.
"""
fname_without_top_level_dir = "/".join(fname.split("/")[1:])
# Move rdir to a temporary directory, so its contents can be moved to
# base_dir without any name conflicts
with TemporaryDirectory() as tmp_dir:
os.rename(os.path.join(base_dir, rdir), os.path.join(tmp_dir, rdir))
# If this condition is false, it means there was no top-level directory,
# so we do nothing
if fname_without_top_level_dir:
zip_ref.extract(fname, base_dir)
# Shift children out of rdir and into base_dir
rdir_children = os.listdir(os.path.join(tmp_dir, rdir))
for child in rdir_children:
os.rename(
os.path.join(base_dir, fname),
os.path.join(base_dir, fname_without_top_level_dir))
os.path.join(tmp_dir, rdir, child),
os.path.join(base_dir, child))
def unzip_package(package_path: str,
@ -563,6 +571,8 @@ def unzip_package(package_path: str,
logger.debug(f"Unpacking {package_path} to {target_dir}")
with ZipFile(str(package_path), "r") as zip_ref:
zip_ref.extractall(target_dir)
if remove_top_level_directory:
top_level_directory = get_top_level_dir_from_compressed_package(
package_path)
@ -571,20 +581,8 @@ def unzip_package(package_path: str,
"a single top level directory. Make sure there "
"are no hidden files at the same level as the "
"top level directory.")
with ZipFile(str(package_path), "r") as zip_ref:
for fname in zip_ref.namelist():
extract_file_and_remove_top_level_dir(
base_dir=target_dir, fname=fname, zip_ref=zip_ref)
# Remove now-empty top_level_directory and any empty subdirectories
# left over from extract_file_and_remove_top_level_dir operations
leftover_top_level_directory = os.path.join(
target_dir, top_level_directory)
if os.path.isdir(leftover_top_level_directory):
shutil.rmtree(leftover_top_level_directory)
else:
with ZipFile(str(package_path), "r") as zip_ref:
zip_ref.extractall(target_dir)
remove_dir_from_filepaths(target_dir, top_level_directory)
if unlink_zip:
Path(package_path).unlink()

View file

@ -1,12 +1,11 @@
import os
from pathlib import Path
import random
from shutil import rmtree, make_archive
from shutil import copytree, rmtree, make_archive
import string
import sys
import tempfile
from filecmp import dircmp
from zipfile import ZipFile
import uuid
import pytest
@ -16,8 +15,8 @@ from ray.experimental.internal_kv import (_internal_kv_del,
from ray._private.runtime_env.packaging import (
_dir_travel, get_uri_for_directory, _get_excludes,
upload_package_if_needed, parse_uri, Protocol,
get_top_level_dir_from_compressed_package,
extract_file_and_remove_top_level_dir, unzip_package)
get_top_level_dir_from_compressed_package, remove_dir_from_filepaths,
unzip_package)
TOP_LEVEL_DIR_NAME = "top_level"
ARCHIVE_NAME = "archive.zip"
@ -168,26 +167,28 @@ class TestGetTopLevelDirFromCompressedPackage:
@pytest.mark.skipif(sys.platform == "win32", reason="Fail to create temp dir.")
class TestExtractFileAndRemoveTopLevelDir:
def test_valid_extraction(self, random_zip_file_with_top_level_dir):
class TestRemoveDirFromFilepaths:
def test_valid_removal(self, random_zip_file_with_top_level_dir):
# This test copies the TOP_LEVEL_DIR_NAME directory, and then it
# shifts the contents of the copied directory into the base tmp_path
# directory. Then it compares the contents of tmp_path with the
# TOP_LEVEL_DIR_NAME directory to ensure that they match.
archive_path = random_zip_file_with_top_level_dir
tmp_path = archive_path[:archive_path.rfind("/")]
rmtree(os.path.join(tmp_path, TOP_LEVEL_DIR_NAME))
with ZipFile(archive_path, "r") as zf:
for fname in zf.namelist():
extract_file_and_remove_top_level_dir(
base_dir=tmp_path, fname=fname, zip_ref=zf)
rmtree(os.path.join(tmp_path, TOP_LEVEL_DIR_NAME))
with ZipFile(archive_path, "r") as zf:
zf.extractall(tmp_path)
original_dir_path = os.path.join(tmp_path, TOP_LEVEL_DIR_NAME)
copy_dir_path = os.path.join(tmp_path, TOP_LEVEL_DIR_NAME + "_copy")
copytree(original_dir_path, copy_dir_path)
remove_dir_from_filepaths(tmp_path, TOP_LEVEL_DIR_NAME + "_copy")
dcmp = dircmp(tmp_path, f"{tmp_path}/{TOP_LEVEL_DIR_NAME}")
# Since this test uses the tmp_path as the target directory, and since
# the tmp_path also contains the zip file and the top level directory,
# make sure that the only difference between the tmp_path's contents
# and the top level directory's contents are the zip file and the top
# level directory itself. This implies that all files have been
# extracted from the top level directory and moved into the tmp_path.
# and the top level directory's contents are the zip file from the
# Pytest fixture and the top level directory itself. This implies that
# all files have been extracted from the top level directory and moved
# into the tmp_path.
assert set(dcmp.left_only) == {ARCHIVE_NAME, TOP_LEVEL_DIR_NAME}
# Make sure that all the subdirectories and files have been moved to
@ -198,17 +199,9 @@ class TestExtractFileAndRemoveTopLevelDir:
@pytest.mark.skipif(sys.platform == "win32", reason="Fail to create temp dir.")
@pytest.mark.parametrize("remove_top_level_directory", [False, True])
@pytest.mark.parametrize("unlink_zip", [False, True])
def test_unzip_package(random_zip_file_with_top_level_dir,
remove_top_level_directory, unlink_zip):
archive_path = random_zip_file_with_top_level_dir
tmp_path = archive_path[:archive_path.rfind("/")]
tmp_subdir = f"{tmp_path}/{TOP_LEVEL_DIR_NAME}_tmp"
unzip_package(
package_path=archive_path,
target_dir=tmp_subdir,
remove_top_level_directory=remove_top_level_directory,
unlink_zip=unlink_zip)
class TestUnzipPackage:
def dcmp_helper(self, remove_top_level_directory, unlink_zip, tmp_subdir,
tmp_path, archive_path):
dcmp = None
if remove_top_level_directory:
dcmp = dircmp(f"{tmp_subdir}", f"{tmp_path}/{TOP_LEVEL_DIR_NAME}")
@ -223,6 +216,48 @@ def test_unzip_package(random_zip_file_with_top_level_dir,
else:
assert Path(archive_path).is_file()
def test_unzip_package(self, random_zip_file_with_top_level_dir,
remove_top_level_directory, unlink_zip):
archive_path = random_zip_file_with_top_level_dir
tmp_path = archive_path[:archive_path.rfind("/")]
tmp_subdir = f"{tmp_path}/{TOP_LEVEL_DIR_NAME}_tmp"
unzip_package(
package_path=archive_path,
target_dir=tmp_subdir,
remove_top_level_directory=remove_top_level_directory,
unlink_zip=unlink_zip)
self.dcmp_helper(remove_top_level_directory, unlink_zip, tmp_subdir,
tmp_path, archive_path)
def test_unzip_with_matching_subdirectory_names(
self, remove_top_level_directory, unlink_zip):
with tempfile.TemporaryDirectory() as tmp_dir:
path = Path(tmp_dir)
top_level_dir = path / TOP_LEVEL_DIR_NAME
top_level_dir.mkdir(parents=True)
next_level_dir = top_level_dir
for _ in range(10):
dir1 = next_level_dir / TOP_LEVEL_DIR_NAME
dir1.mkdir(parents=True)
next_level_dir = dir1
make_archive(path / ARCHIVE_NAME[:ARCHIVE_NAME.rfind(".")], "zip",
path, TOP_LEVEL_DIR_NAME)
archive_path = str(path / ARCHIVE_NAME)
tmp_path = archive_path[:archive_path.rfind("/")]
tmp_subdir = f"{tmp_path}/{TOP_LEVEL_DIR_NAME}_tmp"
unzip_package(
package_path=archive_path,
target_dir=tmp_subdir,
remove_top_level_directory=remove_top_level_directory,
unlink_zip=unlink_zip)
self.dcmp_helper(remove_top_level_directory, unlink_zip,
tmp_subdir, tmp_path, archive_path)
@pytest.mark.skipif(sys.platform == "win32", reason="Fail to create temp dir.")
def test_travel():