
Manylinux downloader and extractor

Valentin Niess 8 months ago
parent
commit
1fdb439e70

+ 1 - 1
python_appimage/appimage/relocate.py

@@ -77,7 +77,7 @@ _excluded_libs = None
 
 
 def patch_binary(path, libdir, recursive=True):
-    '''Patch the RPATH of a binary and and fetch its dependencies
+    '''Patch the RPATH of a binary and fetch its dependencies
     '''
     global _excluded_libs
 

+ 8 - 0
python_appimage/manylinux/__init__.py

@@ -0,0 +1,8 @@
+from .config import Arch, LinuxTag, PythonImpl, PythonVersion
+from .download import Downloader
+from .extract import ImageExtractor, PythonExtractor
+
+
+__all__ = ['Arch', 'Downloader', 'ImageExtractor', 'LinuxTag',
+           'PythonExtractor', 'PythonImpl', 'PythonVersion']
+

+ 81 - 0
python_appimage/manylinux/config.py

@@ -0,0 +1,81 @@
+from enum import auto, Enum
+import platform
+from typing import NamedTuple, Union
+
+
+__all__ = ['Arch', 'LinuxTag', 'PythonImpl', 'PythonVersion']
+
+
+class Arch(Enum):
+    '''Supported platform architectures.'''
+    AARCH64 = auto()
+    I686 = auto()
+    X86_64 = auto()
+
+    def __str__(self):
+        return self.name.lower()
+
+    @classmethod
+    def from_host(cls) -> 'Arch':
+        return cls.from_str(platform.machine())
+
+    @classmethod
+    def from_str(cls, value) -> 'Arch':
+        for arch in cls:
+            if value == str(arch):
+                return arch
+        else:
+            raise NotImplementedError(value)
+
+
+class LinuxTag(Enum):
+    '''Supported platform tags.'''
+    MANYLINUX_1 = auto()
+    MANYLINUX_2010 = auto()
+    MANYLINUX_2014 = auto()
+    MANYLINUX_2_24 = auto()
+    MANYLINUX_2_28 = auto()
+
+    def __str__(self):
+        tag = self.name.lower()
+        if self in (LinuxTag.MANYLINUX_1, LinuxTag.MANYLINUX_2010,
+                    LinuxTag.MANYLINUX_2014):
+            return tag.replace('_', '')
+        else:
+            return tag
+
+    @classmethod
+    def from_str(cls, value) -> 'LinuxTag':
+        for tag in cls:
+            if value == str(tag):
+                return tag
+        else:
+            raise NotImplementedError(value)
+
+
+class PythonImpl(Enum):
+    '''Supported Python implementations.'''
+    CPYTHON = auto()
+
+
+class PythonVersion(NamedTuple):
+    '''Python version, as (major, minor, patch).'''
+
+    major: int
+    minor: int
+    patch: Union[int, str]
+
+    @classmethod
+    def from_str(cls, value: str) -> 'PythonVersion':
+        major, minor, patch = value.split('.', 2)
+        try:
+            patch = int(patch)
+        except ValueError:
+            pass
+        return cls(int(major), int(minor), patch)
+
+    def long(self) -> str:
+        return f'{self.major}.{self.minor}.{self.patch}'
+
+    def short(self) -> str:
+        return f'{self.major}.{self.minor}'
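A quick sketch of how these helpers behave (illustrative only; the version string is a made-up example, the results follow from the definitions above):

    from python_appimage.manylinux import Arch, LinuxTag, PythonVersion

    Arch.from_str('x86_64')                     # Arch.X86_64
    str(LinuxTag.MANYLINUX_2014)                # 'manylinux2014' (legacy tags drop underscores)
    str(LinuxTag.MANYLINUX_2_28)                # 'manylinux_2_28'

    version = PythonVersion.from_str('3.11.1')  # hypothetical version string
    version.short()                             # '3.11'
    version.long()                              # '3.11.1'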

+ 145 - 0
python_appimage/manylinux/download.py

@@ -0,0 +1,145 @@
+import collections
+from dataclasses import dataclass, field
+import glob
+import hashlib
+import json
+from pathlib import Path
+import requests
+import shutil
+import tempfile
+from typing import List, Optional
+
+from .config import Arch, LinuxTag
+from ..utils.log import debug, log
+
+
+CHUNK_SIZE = 8189
+
+SUCCESS = 200
+
+
+class DownloadError(Exception):
+    pass
+
+class TarError(Exception):
+    pass
+
+
+@dataclass(frozen=True)
+class Downloader:
+
+    tag: LinuxTag
+    '''Manylinux tag.'''
+
+    arch: Optional[Arch] = None
+    '''Platform architecture.'''
+
+    image: str = field(init=False)
+    '''Docker image.'''
+
+    token: str = field(init=False)
+    '''Authentication token.'''
+
+
+    def __post_init__(self):
+        # Set host arch if not explicitly specified.
+        if self.arch is None:
+            arch = Arch.from_host()
+            object.__setattr__(self, 'arch', arch)
+
+        # Set image name.
+        image = f'{self.tag}_{self.arch}'
+        object.__setattr__(self, 'image', image)
+
+
+    def download(
+        self,
+        destination: Optional[Path] = None,
+        tag: Optional[str] = 'latest'):
+
+        destination = destination or Path(self.image)
+
+        # Authenticate to quay.io.
+        repository = f'pypa/{self.image}'
+        url = 'https://quay.io/v2/auth'
+        url = f'{url}?service=quay.io&scope=repository:{repository}:pull'
+        debug('GET', url)
+        r = requests.request('GET', url)
+        if r.status_code == SUCCESS:
+            object.__setattr__(self, 'token', r.json()['token'])
+        else:
+            raise DownloadError(r.status_code, r.text, r.headers)
+
+        # Fetch image manifest.
+        repository = f'pypa/{self.image}'
+        url = f'https://quay.io/v2/{repository}/manifests/{tag}'
+        headers = {
+            'Authorization': f'Bearer {self.token}',
+            'Accept': 'application/vnd.docker.distribution.manifest.v2+json'
+        }
+        debug('GET', url)
+        r = requests.request('GET', url, headers=headers)
+        if r.status_code == SUCCESS:
+            image_digest = r.headers['Docker-Content-Digest'].split(':', 1)[-1]
+            manifest = r.json()
+        else:
+            raise DownloadError(r.status_code, r.text, r.headers)
+
+        # Check missing layers to download.
+        required = [layer['digest'].split(':', 1)[-1] for layer in
+                    manifest['layers']]
+        is_missing = lambda hash_: \
+            not (destination / f'layers/{hash_}.tar.gz').exists()
+        missing = tuple(filter(is_missing, required))
+
+        # Fetch missing layers.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            workdir = Path(tmpdir)
+            for i, hash_ in enumerate(missing):
+                log('DOWNLOAD', f'{self.image} ({tag}) '
+                                f'[{i + 1} / {len(missing)}]')
+
+                filename = f'{hash_}.tar.gz'
+                url = f'https://quay.io/v2/{repository}/blobs/sha256:{hash_}'
+                debug('GET', url)
+                r = requests.request('GET', url, headers=headers, stream=True)
+                if r.status_code == SUCCESS:
+                    debug('STREAM', filename)
+                else:
+                    raise DownloadError(r.status_code, r.text, r.headers)
+
+                hasher = hashlib.sha256()
+                tmp = workdir / 'layer.tgz'
+                with open(tmp, "wb") as f:
+                    for chunk in r.iter_content(CHUNK_SIZE): 
+                        if chunk:
+                            f.write(chunk)
+                            hasher.update(chunk)
+
+                    h = hasher.hexdigest()
+                    if h != hash_:
+                        raise DownloadError(
+                            f'bad hash (expected {hash_}, found {h})'
+                        )
+                layers_dir = destination / 'layers'
+                layers_dir.mkdir(exist_ok=True, parents=True)
+                shutil.move(tmp, layers_dir / filename)
+
+        tags_dir = destination / 'tags'
+        tags_dir.mkdir(exist_ok=True, parents=True)
+        with open(tags_dir / f'{tag}.json', "w") as f:
+            json.dump({'digest': image_digest, 'layers': required}, f)
+
+        # Remove unused layers.
+        required = set(required)
+        for tag_file in glob.glob(str(destination / 'tags/*.json')):
+            with open(tag_file) as f:
+                meta = json.load(f)
+                required |= set(meta['layers'])
+        required = [f'{hash_}.tar.gz' for hash_ in required]
+
+        for layer in glob.glob(str(destination / 'layers/*.tar.gz')):
+            layer = Path(layer)
+            if layer.name not in required:
+                debug('REMOVE', f'{self.image} [layer/{layer.stem}]')
+                layer.unlink()
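For context, downloading an image reduces to picking a LinuxTag (and optionally an Arch); layers are cached under the destination directory and unused ones are pruned. A minimal usage sketch, with an illustrative destination path:

    from pathlib import Path

    from python_appimage.manylinux import Arch, Downloader, LinuxTag

    # Fetch (or refresh) the manylinux2014_x86_64 image layers from quay.io.
    downloader = Downloader(tag=LinuxTag.MANYLINUX_2014, arch=Arch.X86_64)
    downloader.download(destination=Path('manylinux2014_x86_64'), tag='latest')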

+ 327 - 0
python_appimage/manylinux/extract.py

@@ -0,0 +1,327 @@
+from dataclasses import dataclass, field
+from distutils.version import LooseVersion
+import glob
+import json
+import os
+import re
+from pathlib import Path
+import shutil
+import stat
+import subprocess
+from typing import Dict, List, NamedTuple, Optional, Union
+
+from .config import Arch, PythonImpl, PythonVersion
+from ..utils.deps import ensure_excludelist, EXCLUDELIST
+from ..utils.log import debug, log
+
+
+@dataclass(frozen=True)
+class PythonExtractor:
+    '''Python runtime extractor, operating on an unpacked Manylinux image.'''
+
+    arch: Arch
+    '''Target architecture'''
+
+    prefix: Path
+    '''Target image path'''
+
+    tag: str
+    '''Python binary tag'''
+
+
+    excludelist: Optional[Path] = None
+    '''Exclude list for shared libraries.'''
+
+    patchelf: Optional[Path] = None
+    '''Patchelf executable.'''
+
+
+    excluded: List[str] = field(init=False)
+    '''Excluded shared libraries.'''
+
+    impl: PythonImpl = field(init=False)
+    '''Python implementation'''
+
+    library_path: List[str] = field(init=False)
+    '''Search paths for libraries (LD_LIBRARY_PATH)'''
+
+    python_prefix: Path = field(init=False)
+    '''Python installation prefix'''
+
+    version: PythonVersion = field(init=False)
+    '''Python version'''
+
+
+    def __post_init__(self):
+        # Locate Python installation.
+        link = os.readlink(self.prefix / f'opt/python/{self.tag}')
+        if not link.startswith('/'):
+            raise NotImplementedError()
+        object.__setattr__(self, 'python_prefix', self.prefix / link[1:])
+
+        # Parse implementation and version.
+        head, tail = Path(link).name.split('-', 1)
+        if head == 'cpython':
+            impl = PythonImpl.CPYTHON
+            version = PythonVersion.from_str(tail)
+        else:
+            raise NotImplementedError()
+        object.__setattr__(self, 'impl', impl)
+        object.__setattr__(self, 'version', version)
+
+        # Set libraries search path.
+        paths = []
+        if self.arch in (Arch.AARCH64, Arch.X86_64):
+            paths.append(self.prefix / 'lib64')
+        elif self.arch == Arch.I686:
+            paths.append(self.prefix / 'lib')
+        else:
+            raise NotImplementedError()
+        paths.append(self.prefix / 'usr/local/lib')
+
+        ssl = glob.glob(str(self.prefix / 'opt/_internal/openssl-*'))
+        if ssl:
+            paths.append(Path(ssl[0]) / 'lib')
+
+        object.__setattr__(self, 'library_path', paths)
+
+        # Set excluded libraries.
+        if self.excludelist:
+            excludelist = Path(self.excludelist)
+        else:
+            ensure_excludelist()
+            excludelist = Path(EXCLUDELIST)
+        excluded = []
+        with excludelist.open() as f:
+            for line in f:
+                line = line.strip()
+                if line and not line.startswith('#'):
+                    excluded.append(line)
+        object.__setattr__(self, 'excluded', excluded)
+
+        # Set patchelf, if not provided.
+        if self.patchelf is None:
+            paths = (
+                Path(__file__).parent / 'bin',
+                Path.home() / '.local/bin'
+            )
+            for path in paths:
+                patchelf = path / 'patchelf'
+                if patchelf.exists():
+                    break
+            else:
+                raise NotImplementedError()
+            object.__setattr__(self, 'patchelf', patchelf)
+        else:
+            assert(self.patchelf.exists())
+
+
+    def extract(self, destination):
+        '''Extract Python runtime.'''
+
+        python = f'python{self.version.short()}'
+        runtime = f'bin/{python}'
+        packages = f'lib/{python}'
+        pip = f'bin/pip{self.version.short()}'
+
+        # Locate include files.
+        include = glob.glob(str(self.python_prefix / 'include/*'))
+        if include:
+            include = Path(include[0]).name
+            include = f'include/{include}'
+        else:
+            raise NotImplementedError()
+
+        # Clone Python runtime.
+        (destination / 'bin').mkdir(exist_ok=True, parents=True)
+        shutil.copy(self.python_prefix / runtime, destination / runtime)
+
+        short = Path(destination / f'bin/python{self.version.major}')
+        short.unlink(missing_ok=True)
+        short.symlink_to(python)
+        short = Path(destination / 'bin/python')
+        short.unlink(missing_ok=True)
+        short.symlink_to(f'python{self.version.major}')
+
+        # Clone pip wrapper.
+        with open(self.python_prefix / pip) as f:
+            f.readline() # Skip shebang.
+            body = f.read()
+
+        with open(destination / pip, 'w') as f:
+            f.write('#! /bin/sh\n')
+            f.write(' '.join((
+                '"exec"',
+                f'"$(dirname $(readlink -f ${0}))/{python}"',
+                '"$0"',
+                '"$@"\n'
+            )))
+            f.write(body)
+        shutil.copymode(self.python_prefix / pip, destination / pip)
+
+        short = Path(destination / f'bin/pip{self.version.major}')
+        short.unlink(missing_ok=True)
+        short.symlink_to(f'pip{self.version.short()}')
+        short = Path(destination / 'bin/pip')
+        short.unlink(missing_ok=True)
+        short.symlink_to(f'pip{self.version.major}')
+
+        # Clone Python packages.
+        for folder in (packages, include):
+            shutil.copytree(self.python_prefix / folder, destination / folder,
+                            symlinks=True, dirs_exist_ok=True)
+
+        # Remove some clutter.
+        shutil.rmtree(destination / packages / 'test', ignore_errors=True)
+        for root, dirs, files in os.walk(destination / packages):
+            root = Path(root)
+            for d in dirs:
+                if d == '__pycache__':
+                    shutil.rmtree(root / d, ignore_errors=True)
+            for f in files:
+                if f.endswith('.pyc'):
+                    (root / f).unlink()
+
+        # Map binary dependencies.
+        libs = self.ldd(self.python_prefix / f'bin/{python}')
+        path = Path(self.python_prefix / f'{packages}/lib-dynload')
+        for module in glob.glob(str(path / "*.so")):
+            l = self.ldd(module)
+            libs.update(l)
+
+        # Copy and patch binary dependencies.
+        libdir = destination / 'lib'
+        for (name, src) in libs.items():
+            dst = libdir / name
+            shutil.copy(src, dst, follow_symlinks=True)
+            # Some libraries are read-only, which prevents overriding the
+            # destination directory. Below, we change the permission of
+            # destination files to read-write (for the owner).
+            mode = dst.stat().st_mode
+            if not (mode & stat.S_IWUSR):
+                mode = mode | stat.S_IWUSR
+                dst.chmod(mode)
+
+            self.set_rpath(dst, '$ORIGIN')
+
+        # Patch RPATHs of binary modules.
+        path = Path(destination / f'{packages}/lib-dynload')
+        for module in glob.glob(str(path / "*.so")):
+            src = Path(module)
+            dst = os.path.relpath(libdir, src.parent)
+            self.set_rpath(src, f'$ORIGIN/{dst}')
+
+        # Patch RPATHs of Python runtime.
+        src = destination / runtime
+        dst = os.path.relpath(libdir, src.parent)
+        self.set_rpath(src, f'$ORIGIN/{dst}')
+
+        # Copy SSL certificates (i.e. clone certifi).
+        certs = self.prefix / 'opt/_internal/certs.pem'
+        if certs.is_symlink():
+            dst = self.prefix / str(certs.readlink())[1:]
+            certifi = dst.parent
+            assert(certifi.name == 'certifi')
+            site_packages = certifi.parent
+            assert(site_packages.name == 'site-packages')
+
+            for src in glob.glob(str(site_packages / 'certifi*')):
+                src = Path(src)
+                dst = destination / f'{packages}/site-packages/{src.name}'
+                if not dst.exists():
+                    shutil.copytree(src, dst, symlinks=True)
+        else:
+            raise NotImplementedError()
+
+        # Copy Tcl & Tk data.
+        tcltk_src = self.prefix / 'usr/local/lib'
+        tx_version = []
+        for match in glob.glob(str(tcltk_src / 'tk*')):
+            path = Path(match)
+            if path.is_dir():
+                tx_version.append(LooseVersion(path.name[2:]))
+        tx_version.sort()
+        tx_version = tx_version[-1]
+
+        tcltk_dir = Path(destination / 'usr/share/tcltk')
+        tcltk_dir.mkdir(exist_ok=True, parents=True)
+
+        for tx in ('tcl', 'tk'):
+            name = f'{tx}{tx_version}'
+            src = tcltk_src / name
+            dst = tcltk_dir / name
+            shutil.copytree(src, dst, symlinks=True, dirs_exist_ok=True)
+
+
+    def ldd(self, target: Path) -> Dict[str, Path]:
+        '''Cross-platform implementation of ldd, using readelf.'''
+
+        pattern = re.compile(r'[(]NEEDED[)]\s+Shared library:\s+\[([^\]]+)\]')
+        dependencies = dict()
+
+        def recurse(target: Path):
+            result = subprocess.run(f'readelf -d {target}', shell=True,
+                                    check=True, capture_output=True)
+            stdout = result.stdout.decode()
+            matches = pattern.findall(stdout)
+
+            for match in matches:
+                if (match not in dependencies) and (match not in self.excluded):
+                    path = self.locate_library(match)
+                    dependencies[match] = path
+                    recurse(path)
+
+        recurse(target)
+        return dependencies
+
+
+    def locate_library(self, name: str) -> Path:
+        '''Locate a library given its qualified name.'''
+
+        for dirname in self.library_path:
+            path = dirname / name
+            if path.exists():
+                return path
+        else:
+            raise FileNotFoundError(name)
+
+
+    def set_rpath(self, target, rpath):
+        cmd = f'{self.patchelf} --print-rpath {target}'
+        result = subprocess.run(cmd, shell=True, check=True,
+                                capture_output=True)
+        current_rpath = result.stdout.decode().strip()
+        if current_rpath != rpath:
+            cmd = f"{self.patchelf} --set-rpath '{rpath}' {target}"
+            subprocess.run(cmd, shell=True, check=True, capture_output=True)
+
+
+@dataclass(frozen=True)
+class ImageExtractor:
+    '''Manylinux image extractor from layers.'''
+
+    prefix: Path
+    '''Manylinux image prefix.'''
+
+    tag: Optional[str] = 'latest'
+    '''Manylinux image tag.'''
+
+
+    def extract(self, destination: Path):
+        '''Extract Manylinux image.'''
+
+        with open(self.prefix / f'tags/{self.tag}.json') as f:
+            meta = json.load(f)
+        layers = meta['layers']
+
+        for layer in layers:
+            debug('EXTRACT', f'{layer}.tar.gz')
+
+            filename = self.prefix / f'layers/{layer}.tar.gz'
+            cmd = ' && '.join((
+                 f'mkdir -p {destination}',
+                 f'tar -xzf {filename} -C {destination}',
+                 f'chmod u+rw -R {destination}'
+            ))
+            subprocess.run(cmd, shell=True, check=True,
+                           capture_output=True)
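The two extractors are meant to be chained: ImageExtractor unpacks the cached layers into a root filesystem, then PythonExtractor clones one of its /opt/python installations. A hedged sketch, assuming the image downloaded above ships a cp311-cp311 build and that a patchelf binary is discoverable; paths and the AppDir destination are illustrative:

    from pathlib import Path

    from python_appimage.manylinux import Arch, ImageExtractor, PythonExtractor

    # Unpack the downloaded layers (tags/latest.json + layers/*.tar.gz).
    prefix = Path('manylinux2014_x86_64')
    rootfs = prefix / 'rootfs'
    ImageExtractor(prefix=prefix).extract(rootfs)

    # Clone a Python runtime (interpreter, stdlib, pip, certifi, Tcl/Tk data).
    extractor = PythonExtractor(arch=Arch.X86_64, prefix=rootfs, tag='cp311-cp311')
    extractor.extract(Path('AppDir/usr'))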

+ 15 - 14
python_appimage/utils/deps.py

@@ -9,28 +9,30 @@ from .tmp import TemporaryDirectory
 from .url import urlretrieve
 
 
-__all__ = ['APPIMAGETOOL', 'EXCLUDELIST', 'PATCHELF', 'PREFIX',
-           'ensure_appimagetool', 'ensure_excludelist', 'ensure_patchelf']
-
-
 _ARCH = platform.machine()
 
+_CACHE_DIR = os.path.expanduser('~/.cache/python-appimage')
+
 
 PREFIX = os.path.abspath(os.path.dirname(__file__) + '/..')
 '''Package installation prefix'''
 
-APPIMAGETOOL_DIR = os.path.expanduser('~/.local/bin')
+APPIMAGETOOL_DIR = os.path.join(_CACHE_DIR, 'bin')
 '''Location of the appimagetool binary'''
 
 APPIMAGETOOL_VERSION = '12'
 '''Version of the appimagetool binary'''
 
-EXCLUDELIST = PREFIX + '/data/excludelist'
+EXCLUDELIST = os.path.join(_CACHE_DIR, 'share/excludelist')
 '''AppImage exclusion list'''
 
-PATCHELF = os.path.expanduser('~/.local/bin/patchelf')
+PATCHELF = os.path.join(_CACHE_DIR, 'bin/patchelf')
 '''Location of the PatchELF binary'''
 
+PATCHELF_VERSION = '0.14.3'
+'''Version of the patchelf binary'''
+
+
 def ensure_appimagetool(dry=False):
     '''Fetch appimagetool from the web if not available locally
     '''
@@ -91,19 +93,18 @@ def ensure_patchelf():
     if os.path.exists(PATCHELF):
         return False
 
-    iarch = 'i386' if _ARCH == 'i686' else _ARCH
-    appimage = 'patchelf-{0:}.AppImage'.format(iarch)
-    baseurl = 'https://github.com/niess/patchelf.appimage/releases/download'
+    tgz = '-'.join(('patchelf', PATCHELF_VERSION, _ARCH)) + '.tar.gz'
+    baseurl = 'https://github.com/NixOS/patchelf'
     log('INSTALL', 'patchelf from %s', baseurl)
 
     dirname = os.path.dirname(PATCHELF)
     patchelf = dirname + '/patchelf'
     make_tree(dirname)
     with TemporaryDirectory() as tmpdir:
-        urlretrieve(os.path.join(baseurl, 'rolling', appimage), appimage)
-        os.chmod(appimage, stat.S_IRWXU)
-        system(('./' + appimage, '--appimage-extract'))
-        copy_file('squashfs-root/usr/bin/patchelf', patchelf)
+        urlretrieve(os.path.join(baseurl, 'releases', 'download',
+                                 PATCHELF_VERSION, tgz), tgz)
+        system(('tar', 'xzf', tgz))
+        copy_file('bin/patchelf', patchelf)
     os.chmod(patchelf, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
 
     return True
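With this change, the helper binaries and the exclude list move from the package data and ~/.local into a per-user cache. A small sketch of the resulting behaviour (the printed path assumes the _CACHE_DIR defined above):

    from python_appimage.utils.deps import PATCHELF, ensure_patchelf

    ensure_patchelf()   # fetches the 0.14.3 release tarball from NixOS/patchelf if missing
    print(PATCHELF)     # ~/.cache/python-appimage/bin/patchelf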

+ 4 - 0
python_appimage/utils/url.py

@@ -32,6 +32,10 @@ def urlretrieve(url, filename=None):
     else:
         debug('DOWNLOAD', '%s as %s', url, filename)
 
+    parent_directory = os.path.dirname(filename)
+    if parent_directory and not os.path.exists(parent_directory):
+        os.makedirs(parent_directory)
+
     if _urlretrieve is None:
         data = urllib2.urlopen(url).read()
         with open(filename, 'w') as f: