1
0

extract.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. from dataclasses import dataclass, field
  2. from distutils.version import LooseVersion
  3. import glob
  4. import json
  5. import os
  6. import re
  7. from pathlib import Path
  8. import shutil
  9. import stat
  10. import subprocess
  11. from typing import Dict, List, NamedTuple, Optional, Union
  12. from .config import Arch, PythonImpl, PythonVersion
  13. from ..utils.deps import ensure_excludelist, EXCLUDELIST
  14. from ..utils.log import debug, log
  15. @dataclass(frozen=True)
  16. class PythonExtractor:
  17. '''Python extractor from an extracted Manylinux image.'''
  18. arch: Arch
  19. '''Target architecture'''
  20. prefix: Path
  21. '''Target image path'''
  22. tag: str
  23. '''Python binary tag'''
  24. excludelist: Optional[Path] = None
  25. '''Exclude list for shared libraries.'''
  26. patchelf: Optional[Path] = None
  27. '''Patchelf executable.'''
  28. excluded: List[str] = field(init=False)
  29. '''Excluded shared libraries.'''
  30. impl: PythonImpl = field(init=False)
  31. '''Python implementation'''
  32. library_path: List[str] = field(init=False)
  33. '''Search paths for libraries (LD_LIBRARY_PATH)'''
  34. python_prefix: Path = field(init=False)
  35. '''Python installation prefix'''
  36. version: PythonVersion = field(init=False)
  37. '''Python version'''
  38. def __post_init__(self):
  39. # Locate Python installation.
  40. link = os.readlink(self.prefix / f'opt/python/{self.tag}')
  41. if not link.startswith('/'):
  42. raise NotImplementedError()
  43. object.__setattr__(self, 'python_prefix', self.prefix / link[1:])
  44. # Parse implementation and version.
  45. head, tail = Path(link).name.split('-', 1)
  46. if head == 'cpython':
  47. impl = PythonImpl.CPYTHON
  48. version = PythonVersion.from_str(tail)
  49. else:
  50. raise NotImplementedError()
  51. object.__setattr__(self, 'impl', impl)
  52. object.__setattr__(self, 'version', version)
  53. # Set libraries search path.
  54. paths = []
  55. if self.arch in (Arch.AARCH64, Arch.X86_64):
  56. paths.append(self.prefix / 'lib64')
  57. elif self.arch == Arch.I686:
  58. paths.append(self.prefix / 'lib')
  59. else:
  60. raise NotImplementedError()
  61. paths.append(self.prefix / 'usr/local/lib')
  62. ssl = glob.glob(str(self.prefix / 'opt/_internal/openssl-*'))
  63. if ssl:
  64. paths.append(Path(ssl[0]) / 'lib')
  65. object.__setattr__(self, 'library_path', paths)
  66. # Set excluded libraries.
  67. if self.excludelist:
  68. excludelist = Path(self.excludelist)
  69. else:
  70. ensure_excludelist()
  71. excludelist = Path(EXCLUDELIST)
  72. excluded = []
  73. with excludelist.open() as f:
  74. for line in f:
  75. line = line.strip()
  76. if line and not line.startswith('#'):
  77. excluded.append(line)
  78. object.__setattr__(self, 'excluded', excluded)
  79. # Set patchelf, if not provided.
  80. if self.patchelf is None:
  81. paths = (
  82. Path(__file__).parent / 'bin',
  83. Path.home() / '.local/bin'
  84. )
  85. for path in paths:
  86. patchelf = path / 'patchelf'
  87. if patchelf.exists():
  88. break
  89. else:
  90. raise NotImplementedError()
  91. object.__setattr__(self, 'patchelf', patchelf)
  92. else:
  93. assert(self.patchelf.exists())
  94. def extract(self, destination):
  95. '''Extract Python runtime.'''
  96. python = f'python{self.version.short()}'
  97. runtime = f'bin/{python}'
  98. packages = f'lib/{python}'
  99. pip = f'bin/pip{self.version.short()}'
  100. # Locate include files.
  101. include = glob.glob(str(self.python_prefix / 'include/*'))
  102. if include:
  103. include = Path(include[0]).name
  104. include = f'include/{include}'
  105. else:
  106. raise NotImplementedError()
  107. # Clone Python runtime.
  108. (destination / 'bin').mkdir(exist_ok=True, parents=True)
  109. shutil.copy(self.python_prefix / runtime, destination / runtime)
  110. short = Path(destination / f'bin/python{self.version.major}')
  111. short.unlink(missing_ok=True)
  112. short.symlink_to(python)
  113. short = Path(destination / 'bin/python')
  114. short.unlink(missing_ok=True)
  115. short.symlink_to(f'python{self.version.major}')
  116. # Clone pip wrapper.
  117. with open(self.python_prefix / pip) as f:
  118. f.readline() # Skip shebang.
  119. body = f.read()
  120. with open(destination / pip, 'w') as f:
  121. f.write('#! /bin/sh\n')
  122. f.write(' '.join((
  123. '"exec"',
  124. f'"$(dirname $(readlink -f ${0}))/{python}"',
  125. '"$0"',
  126. '"$@"\n'
  127. )))
  128. f.write(body)
  129. shutil.copymode(self.python_prefix / pip, destination / pip)
  130. short = Path(destination / f'bin/pip{self.version.major}')
  131. short.unlink(missing_ok=True)
  132. short.symlink_to(f'pip{self.version.short()}')
  133. short = Path(destination / 'bin/pip')
  134. short.unlink(missing_ok=True)
  135. short.symlink_to(f'pip{self.version.major}')
  136. # Clone Python packages.
  137. for folder in (packages, include):
  138. shutil.copytree(self.python_prefix / folder, destination / folder,
  139. symlinks=True, dirs_exist_ok=True)
  140. # Remove some clutters.
  141. shutil.rmtree(destination / packages / 'test', ignore_errors=True)
  142. for root, dirs, files in os.walk(destination / packages):
  143. root = Path(root)
  144. for d in dirs:
  145. if d == '__pycache__':
  146. shutil.rmtree(root / d, ignore_errors=True)
  147. for f in files:
  148. if f.endswith('.pyc'):
  149. (root / f).unlink()
  150. # Map binary dependencies.
  151. libs = self.ldd(self.python_prefix / f'bin/{python}')
  152. path = Path(self.python_prefix / f'{packages}/lib-dynload')
  153. for module in glob.glob(str(path / "*.so")):
  154. l = self.ldd(module)
  155. libs.update(l)
  156. # Copy and patch binary dependencies.
  157. libdir = destination / 'lib'
  158. for (name, src) in libs.items():
  159. dst = libdir / name
  160. shutil.copy(src, dst, follow_symlinks=True)
  161. # Some libraries are read-only, which prevents overriding the
  162. # destination directory. Below, we change the permission of
  163. # destination files to read-write (for the owner).
  164. mode = dst.stat().st_mode
  165. if not (mode & stat.S_IWUSR):
  166. mode = mode | stat.S_IWUSR
  167. dst.chmod(mode)
  168. self.set_rpath(dst, '$ORIGIN')
  169. # Patch RPATHs of binary modules.
  170. path = Path(destination / f'{packages}/lib-dynload')
  171. for module in glob.glob(str(path / "*.so")):
  172. src = Path(module)
  173. dst = os.path.relpath(libdir, src.parent)
  174. self.set_rpath(src, f'$ORIGIN/{dst}')
  175. # Patch RPATHs of Python runtime.
  176. src = destination / runtime
  177. dst = os.path.relpath(libdir, src.parent)
  178. self.set_rpath(src, f'$ORIGIN/{dst}')
  179. # Copy SSL certificates (i.e. clone certifi).
  180. certs = self.prefix / 'opt/_internal/certs.pem'
  181. if certs.is_symlink():
  182. dst = self.prefix / str(certs.readlink())[1:]
  183. certifi = dst.parent
  184. assert(certifi.name == 'certifi')
  185. site_packages = certifi.parent
  186. assert(site_packages.name == 'site-packages')
  187. for src in glob.glob(str(site_packages / 'certifi*')):
  188. src = Path(src)
  189. dst = destination / f'{packages}/site-packages/{src.name}'
  190. if not dst.exists():
  191. shutil.copytree(src, dst, symlinks=True)
  192. else:
  193. raise NotImplementedError()
  194. # Copy Tcl & Tk data.
  195. tcltk_src = self.prefix / 'usr/local/lib'
  196. tx_version = []
  197. for match in glob.glob(str(tcltk_src / 'tk*')):
  198. path = Path(match)
  199. if path.is_dir():
  200. tx_version.append(LooseVersion(path.name[2:]))
  201. tx_version.sort()
  202. tx_version = tx_version[-1]
  203. tcltk_dir = Path(destination / 'usr/share/tcltk')
  204. tcltk_dir.mkdir(exist_ok=True, parents=True)
  205. for tx in ('tcl', 'tk'):
  206. name = f'{tx}{tx_version}'
  207. src = tcltk_src / name
  208. dst = tcltk_dir / name
  209. shutil.copytree(src, dst, symlinks=True, dirs_exist_ok=True)
  210. def ldd(self, target: Path) -> Dict[str, Path]:
  211. '''Cross-platform implementation of ldd, using readelf.'''
  212. pattern = re.compile(r'[(]NEEDED[)]\s+Shared library:\s+\[([^\]]+)\]')
  213. dependencies = dict()
  214. def recurse(target: Path):
  215. result = subprocess.run(f'readelf -d {target}', shell=True,
  216. check=True, capture_output=True)
  217. stdout = result.stdout.decode()
  218. matches = pattern.findall(stdout)
  219. for match in matches:
  220. if (match not in dependencies) and (match not in self.excluded):
  221. path = self.locate_library(match)
  222. dependencies[match] = path
  223. subs = recurse(path)
  224. recurse(target)
  225. return dependencies
  226. def locate_library(self, name: str) -> Path:
  227. '''Locate a library given its qualified name.'''
  228. for dirname in self.library_path:
  229. path = dirname / name
  230. if path.exists():
  231. return path
  232. else:
  233. raise FileNotFoundError(name)
  234. def set_rpath(self, target, rpath):
  235. cmd = f'{self.patchelf} --print-rpath {target}'
  236. result = subprocess.run(cmd, shell=True, check=True,
  237. capture_output=True)
  238. current_rpath = result.stdout.decode().strip()
  239. if current_rpath != rpath:
  240. cmd = f"{self.patchelf} --set-rpath '{rpath}' {target}"
  241. subprocess.run(cmd, shell=True, check=True, capture_output=True)
  242. @dataclass(frozen=True)
  243. class ImageExtractor:
  244. '''Manylinux image extractor from layers.'''
  245. prefix: Path
  246. '''Manylinux image prefix.'''
  247. tag: Optional[str] = 'latest'
  248. '''Manylinux image tag.'''
  249. def extract(self, destination: Path):
  250. '''Extract Manylinux image.'''
  251. with open(self.prefix / f'tags/{self.tag}.json') as f:
  252. meta = json.load(f)
  253. layers = meta['layers']
  254. for layer in layers:
  255. debug('EXTRACT', f'{layer}.tar.gz')
  256. filename = self.prefix / f'layers/{layer}.tar.gz'
  257. cmd = ' && '.join((
  258. f'mkdir -p {destination}',
  259. f'tar -xzf {filename} -C {destination}',
  260. f'chmod u+rw -R {destination}'
  261. ))
  262. process = subprocess.run(cmd, shell=True, check=True,
  263. capture_output=True)