1
0

extract.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367
  1. import atexit
  2. from dataclasses import dataclass, field
  3. from distutils.version import LooseVersion
  4. import glob
  5. import json
  6. import os
  7. import re
  8. from pathlib import Path
  9. import shutil
  10. import stat
  11. import subprocess
  12. from typing import Dict, List, NamedTuple, Optional, Union
  13. from .config import Arch, PythonImpl, PythonVersion
  14. from ..utils.deps import ensure_excludelist, EXCLUDELIST
  15. from ..utils.log import debug, log
  16. @dataclass(frozen=True)
  17. class PythonExtractor:
  18. '''Python extractor from an extracted Manylinux image.'''
  19. arch: Arch
  20. '''Target architecture'''
  21. prefix: Path
  22. '''Target image path'''
  23. tag: str
  24. '''Python binary tag'''
  25. excludelist: Optional[Path] = None
  26. '''Exclude list for shared libraries.'''
  27. patchelf: Optional[Path] = None
  28. '''Patchelf executable.'''
  29. excluded: List[str] = field(init=False)
  30. '''Excluded shared libraries.'''
  31. impl: PythonImpl = field(init=False)
  32. '''Python implementation'''
  33. library_path: List[str] = field(init=False)
  34. '''Search paths for libraries (LD_LIBRARY_PATH)'''
  35. python_prefix: Path = field(init=False)
  36. '''Python installation prefix'''
  37. version: PythonVersion = field(init=False)
  38. '''Python version'''
  39. def __post_init__(self):
  40. # Locate Python installation.
  41. link = os.readlink(self.prefix / f'opt/python/{self.tag}')
  42. if not link.startswith('/'):
  43. raise NotImplementedError()
  44. object.__setattr__(self, 'python_prefix', self.prefix / link[1:])
  45. # Parse implementation and version.
  46. head, tail = Path(link).name.split('-', 1)
  47. if head == 'cpython':
  48. impl = PythonImpl.CPYTHON
  49. version = PythonVersion.from_str(tail)
  50. else:
  51. raise NotImplementedError()
  52. object.__setattr__(self, 'impl', impl)
  53. object.__setattr__(self, 'version', version)
  54. # Set libraries search path.
  55. paths = []
  56. if self.arch in (Arch.AARCH64, Arch.X86_64):
  57. paths.append(self.prefix / 'lib64')
  58. elif self.arch == Arch.I686:
  59. paths.append(self.prefix / 'lib')
  60. else:
  61. raise NotImplementedError()
  62. paths.append(self.prefix / 'usr/local/lib')
  63. ssl = glob.glob(str(self.prefix / 'opt/_internal/openssl-*'))
  64. if ssl:
  65. paths.append(Path(ssl[0]) / 'lib')
  66. mpdecimal = glob.glob(str(self.prefix / 'opt/_internal/mpdecimal-*'))
  67. if mpdecimal:
  68. paths.append(Path(mpdecimal[0]) / 'lib')
  69. object.__setattr__(self, 'library_path', paths)
  70. # Set excluded libraries.
  71. if self.excludelist:
  72. excludelist = Path(self.excludelist)
  73. else:
  74. ensure_excludelist()
  75. excludelist = Path(EXCLUDELIST)
  76. excluded = []
  77. with excludelist.open() as f:
  78. for line in f:
  79. line = line.strip()
  80. if line and not line.startswith('#'):
  81. excluded.append(line)
  82. object.__setattr__(self, 'excluded', excluded)
  83. # Set patchelf, if not provided.
  84. if self.patchelf is None:
  85. paths = (
  86. Path(__file__).parent / 'bin',
  87. Path.home() / '.local/bin'
  88. )
  89. for path in paths:
  90. patchelf = path / 'patchelf'
  91. if patchelf.exists():
  92. break
  93. else:
  94. raise NotImplementedError()
  95. object.__setattr__(self, 'patchelf', patchelf)
  96. else:
  97. assert(self.patchelf.exists())
  98. def extract(self, destination):
  99. '''Extract Python runtime.'''
  100. python = f'python{self.version.short()}'
  101. runtime = f'bin/{python}'
  102. packages = f'lib/{python}'
  103. pip = f'bin/pip{self.version.short()}'
  104. # Locate include files.
  105. include = glob.glob(str(self.python_prefix / 'include/*'))
  106. if include:
  107. include = Path(include[0]).name
  108. include = f'include/{include}'
  109. else:
  110. raise NotImplementedError()
  111. # Clone Python runtime.
  112. (destination / 'bin').mkdir(exist_ok=True, parents=True)
  113. shutil.copy(self.python_prefix / runtime, destination / runtime)
  114. short = Path(destination / f'bin/python{self.version.major}')
  115. short.unlink(missing_ok=True)
  116. short.symlink_to(python)
  117. short = Path(destination / 'bin/python')
  118. short.unlink(missing_ok=True)
  119. short.symlink_to(f'python{self.version.major}')
  120. # Clone pip wrapper.
  121. with open(self.python_prefix / pip) as f:
  122. f.readline() # Skip shebang.
  123. body = f.read()
  124. with open(destination / pip, 'w') as f:
  125. f.write('#! /bin/sh\n')
  126. f.write(' '.join((
  127. '"exec"',
  128. f'"$(dirname $(readlink -f ${0}))/{python}"',
  129. '"$0"',
  130. '"$@"\n'
  131. )))
  132. f.write(body)
  133. shutil.copymode(self.python_prefix / pip, destination / pip)
  134. short = Path(destination / f'bin/pip{self.version.major}')
  135. short.unlink(missing_ok=True)
  136. short.symlink_to(f'pip{self.version.short()}')
  137. short = Path(destination / 'bin/pip')
  138. short.unlink(missing_ok=True)
  139. short.symlink_to(f'pip{self.version.major}')
  140. # Clone Python packages.
  141. for folder in (packages, include):
  142. shutil.copytree(self.python_prefix / folder, destination / folder,
  143. symlinks=True, dirs_exist_ok=True)
  144. # Remove some clutters.
  145. shutil.rmtree(destination / packages / 'test', ignore_errors=True)
  146. for root, dirs, files in os.walk(destination / packages):
  147. root = Path(root)
  148. for d in dirs:
  149. if d == '__pycache__':
  150. shutil.rmtree(root / d, ignore_errors=True)
  151. for f in files:
  152. if f.endswith('.pyc'):
  153. (root / f).unlink()
  154. # Map binary dependencies.
  155. libs = self.ldd(self.python_prefix / f'bin/{python}')
  156. path = Path(self.python_prefix / f'{packages}/lib-dynload')
  157. for module in glob.glob(str(path / "*.so")):
  158. l = self.ldd(module)
  159. libs.update(l)
  160. # Copy and patch binary dependencies.
  161. libdir = destination / 'lib'
  162. for (name, src) in libs.items():
  163. dst = libdir / name
  164. shutil.copy(src, dst, follow_symlinks=True)
  165. # Some libraries are read-only, which prevents overriding the
  166. # destination directory. Below, we change the permission of
  167. # destination files to read-write (for the owner).
  168. mode = dst.stat().st_mode
  169. if not (mode & stat.S_IWUSR):
  170. mode = mode | stat.S_IWUSR
  171. dst.chmod(mode)
  172. self.set_rpath(dst, '$ORIGIN')
  173. # Patch RPATHs of binary modules.
  174. path = Path(destination / f'{packages}/lib-dynload')
  175. for module in glob.glob(str(path / "*.so")):
  176. src = Path(module)
  177. dst = os.path.relpath(libdir, src.parent)
  178. self.set_rpath(src, f'$ORIGIN/{dst}')
  179. # Patch RPATHs of Python runtime.
  180. src = destination / runtime
  181. dst = os.path.relpath(libdir, src.parent)
  182. self.set_rpath(src, f'$ORIGIN/{dst}')
  183. # Copy SSL certificates (i.e. clone certifi).
  184. certs = self.prefix / 'opt/_internal/certs.pem'
  185. if certs.is_symlink():
  186. dst = self.prefix / str(certs.readlink())[1:]
  187. certifi = dst.parent
  188. assert(certifi.name == 'certifi')
  189. site_packages = certifi.parent
  190. assert(site_packages.name == 'site-packages')
  191. for src in glob.glob(str(site_packages / 'certifi*')):
  192. src = Path(src)
  193. dst = destination / f'{packages}/site-packages/{src.name}'
  194. if not dst.exists():
  195. shutil.copytree(src, dst, symlinks=True)
  196. else:
  197. raise NotImplementedError()
  198. # Copy Tcl & Tk data.
  199. tcltk_src = self.prefix / 'usr/local/lib'
  200. tx_version = []
  201. for match in glob.glob(str(tcltk_src / 'tk*')):
  202. path = Path(match)
  203. if path.is_dir():
  204. tx_version.append(LooseVersion(path.name[2:]))
  205. tx_version.sort()
  206. tx_version = tx_version[-1]
  207. tcltk_dir = Path(destination / 'usr/share/tcltk')
  208. tcltk_dir.mkdir(exist_ok=True, parents=True)
  209. for tx in ('tcl', 'tk'):
  210. name = f'{tx}{tx_version}'
  211. src = tcltk_src / name
  212. dst = tcltk_dir / name
  213. shutil.copytree(src, dst, symlinks=True, dirs_exist_ok=True)
  214. def ldd(self, target: Path) -> Dict[str, Path]:
  215. '''Cross-platform implementation of ldd, using readelf.'''
  216. pattern = re.compile(r'[(]NEEDED[)]\s+Shared library:\s+\[([^\]]+)\]')
  217. dependencies = dict()
  218. def recurse(target: Path):
  219. result = subprocess.run(f'readelf -d {target}', shell=True,
  220. check=True, capture_output=True)
  221. stdout = result.stdout.decode()
  222. matches = pattern.findall(stdout)
  223. for match in matches:
  224. if (match not in dependencies) and (match not in self.excluded):
  225. path = self.locate_library(match)
  226. dependencies[match] = path
  227. subs = recurse(path)
  228. recurse(target)
  229. return dependencies
  230. def locate_library(self, name: str) -> Path:
  231. '''Locate a library given its qualified name.'''
  232. for dirname in self.library_path:
  233. path = dirname / name
  234. if path.exists():
  235. return path
  236. else:
  237. raise FileNotFoundError(name)
  238. def set_rpath(self, target, rpath):
  239. cmd = f'{self.patchelf} --print-rpath {target}'
  240. result = subprocess.run(cmd, shell=True, check=True,
  241. capture_output=True)
  242. current_rpath = result.stdout.decode().strip()
  243. if current_rpath != rpath:
  244. cmd = f"{self.patchelf} --set-rpath '{rpath}' {target}"
  245. subprocess.run(cmd, shell=True, check=True, capture_output=True)
  246. @dataclass(frozen=True)
  247. class ImageExtractor:
  248. '''Manylinux image extractor from layers.'''
  249. prefix: Path
  250. '''Manylinux image prefix.'''
  251. tag: Optional[str] = 'latest'
  252. '''Manylinux image tag.'''
  253. def default_destination(self):
  254. return self.prefix / f'extracted/{self.tag}'
  255. def extract(self, destination: Optional[Path]=None, *, cleanup=False):
  256. '''Extract Manylinux image.'''
  257. if destination is None:
  258. destination = self.default_destination()
  259. if cleanup:
  260. def cleanup(destination):
  261. shutil.rmtree(destination, ignore_errors=True)
  262. atexit.register(cleanup, destination)
  263. with open(self.prefix / f'tags/{self.tag}.json') as f:
  264. meta = json.load(f)
  265. layers = meta['layers']
  266. extracted = []
  267. extracted_file = destination / '.extracted'
  268. if destination.exists():
  269. clean_destination = True
  270. if extracted_file.exists():
  271. with extracted_file.open() as f:
  272. extracted = f.read().split(os.linesep)[:-1]
  273. for a, b in zip(layers, extracted):
  274. if a != b:
  275. break
  276. else:
  277. clean_destination = False
  278. if clean_destination:
  279. shutil.rmtree(destination, ignore_errors=True)
  280. for i, layer in enumerate(layers):
  281. try:
  282. if layer == extracted[i]:
  283. continue
  284. except IndexError:
  285. pass
  286. debug('EXTRACT', f'{layer}.tar.gz')
  287. filename = self.prefix / f'layers/{layer}.tar.gz'
  288. cmd = ''.join((
  289. f'trap \'chmod u+rw -R {destination}\' EXIT ; ',
  290. f'mkdir -p {destination} && ',
  291. f'tar -xzf {filename} -C {destination} && ',
  292. f'echo \'{layer}\' >> {extracted_file}'
  293. ))
  294. process = subprocess.run(f'/bin/bash -c "{cmd}"', shell=True,
  295. check=True, capture_output=True)