extract.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418
  1. import atexit
  2. from dataclasses import dataclass, field
  3. from distutils.version import LooseVersion
  4. import glob
  5. import json
  6. import os
  7. import re
  8. from pathlib import Path
  9. import shutil
  10. import stat
  11. import subprocess
  12. from typing import Dict, List, Optional
  13. from .config import Arch, PythonImpl, PythonVersion
  14. from ..appimage import Appifier
  15. from ..utils.deps import ensure_excludelist, ensure_patchelf, EXCLUDELIST, \
  16. PATCHELF
  17. from ..utils.log import debug, log
  18. @dataclass(frozen=True)
  19. class PythonExtractor:
  20. '''Python extractor from an extracted Manylinux image.'''
  21. arch: Arch
  22. '''Target architecture'''
  23. prefix: Path
  24. '''Target image path'''
  25. tag: str
  26. '''Python binary tag'''
  27. excludelist: Optional[Path] = None
  28. '''Exclude list for shared libraries.'''
  29. patchelf: Optional[Path] = None
  30. '''Patchelf executable.'''
  31. excluded: List[str] = field(init=False)
  32. '''Excluded shared libraries.'''
  33. impl: PythonImpl = field(init=False)
  34. '''Python implementation'''
  35. library_path: List[str] = field(init=False)
  36. '''Search paths for libraries (LD_LIBRARY_PATH)'''
  37. python_prefix: Path = field(init=False)
  38. '''Python installation prefix'''
  39. version: PythonVersion = field(init=False)
  40. '''Python version'''
  41. def __post_init__(self):
  42. # Locate Python installation.
  43. link = os.readlink(self.prefix / f'opt/python/{self.tag}')
  44. if not link.startswith('/'):
  45. raise NotImplementedError()
  46. object.__setattr__(self, 'python_prefix', self.prefix / link[1:])
  47. # Parse implementation and version.
  48. head, tail = Path(link).name.split('-', 1)
  49. if head == 'cpython':
  50. impl = PythonImpl.CPYTHON
  51. version = PythonVersion.from_str(tail)
  52. else:
  53. raise NotImplementedError()
  54. object.__setattr__(self, 'impl', impl)
  55. object.__setattr__(self, 'version', version)
  56. # Set libraries search path.
  57. paths = []
  58. if self.arch in (Arch.AARCH64, Arch.X86_64):
  59. paths.append(self.prefix / 'lib64')
  60. paths.append(self.prefix / 'usr/lib64')
  61. if self.arch == Arch.X86_64:
  62. paths.append(self.prefix / 'lib/x86_64-linux-gnu')
  63. paths.append(self.prefix / 'usr/lib/x86_64-linux-gnu')
  64. else:
  65. paths.append(self.prefix / 'lib/aarch64-linux-gnu')
  66. paths.append(self.prefix / 'usr/lib/aarch64-linux-gnu')
  67. elif self.arch == Arch.I686:
  68. paths.append(self.prefix / 'lib')
  69. paths.append(self.prefix / 'usr/lib')
  70. paths.append(self.prefix / 'lib/i386-linux-gnu')
  71. paths.append(self.prefix / 'usr/lib/i386-linux-gnu')
  72. else:
  73. raise NotImplementedError()
  74. paths.append(self.prefix / 'usr/local/lib')
  75. patterns = (
  76. 'curl-*',
  77. 'mpdecimal-*',
  78. 'openssl-*',
  79. 'sqlite*',
  80. )
  81. for pattern in patterns:
  82. pattern = str(self.prefix / f'opt/_internal/{pattern}/lib')
  83. for match in glob.glob(pattern):
  84. paths.append(Path(match))
  85. object.__setattr__(self, 'library_path', paths)
  86. # Set excluded libraries.
  87. if self.excludelist:
  88. excludelist = Path(self.excludelist)
  89. else:
  90. ensure_excludelist()
  91. excludelist = Path(EXCLUDELIST)
  92. excluded = set()
  93. with excludelist.open() as f:
  94. for line in f:
  95. line = line.strip()
  96. if line and not line.startswith('#'):
  97. excluded.add(line)
  98. excluded.add('ld-linux-aarch64.so.1') # patch for aarch64.
  99. object.__setattr__(self, 'excluded', excluded)
  100. # Set patchelf, if not provided.
  101. if self.patchelf is None:
  102. ensure_patchelf()
  103. object.__setattr__(self, 'patchelf', PATCHELF)
  104. else:
  105. assert(self.patchelf.exists())
  106. def extract(
  107. self,
  108. destination: Path,
  109. *,
  110. appify: Optional[bool]=False,
  111. python_prefix: Optional[str]=None,
  112. system_prefix: Optional[str]=None,
  113. ):
  114. '''Extract Python runtime.'''
  115. python = f'python{self.version.short()}'
  116. flavoured_python = f'python{self.version.flavoured()}'
  117. runtime = f'bin/{flavoured_python}'
  118. packages = f'lib/{flavoured_python}'
  119. pip = f'bin/pip{self.version.short()}'
  120. if python_prefix is None:
  121. python_prefix = f'opt/{flavoured_python}'
  122. if system_prefix is None:
  123. system_prefix = 'usr'
  124. python_dest = destination / python_prefix
  125. system_dest = destination / system_prefix
  126. # Locate include files.
  127. include = glob.glob(str(self.python_prefix / 'include/*'))
  128. if include:
  129. include = Path(include[0]).name
  130. include = f'include/{include}'
  131. else:
  132. raise NotImplementedError()
  133. # Clone Python runtime.
  134. log('CLONE',
  135. f'{python} from {self.python_prefix.relative_to(self.prefix)}')
  136. (python_dest / 'bin').mkdir(exist_ok=True, parents=True)
  137. shutil.copy(self.python_prefix / runtime, python_dest / runtime)
  138. # Clone pip wrapper.
  139. with open(self.python_prefix / pip) as f:
  140. f.readline() # Skip shebang.
  141. body = f.read()
  142. with open(python_dest / pip, 'w') as f:
  143. f.write('#! /bin/sh\n')
  144. f.write(' '.join((
  145. '"exec"',
  146. f'"$(dirname $(readlink -f ${0}))/{flavoured_python}"',
  147. '"$0"',
  148. '"$@"\n'
  149. )))
  150. f.write(body)
  151. shutil.copymode(self.python_prefix / pip, python_dest / pip)
  152. # Clone Python packages.
  153. for folder in (packages, include):
  154. shutil.copytree(self.python_prefix / folder, python_dest / folder,
  155. symlinks=True, dirs_exist_ok=True)
  156. # Remove some clutters.
  157. log('PRUNE', '%s packages', python)
  158. shutil.rmtree(python_dest / packages / 'test', ignore_errors=True)
  159. for root, dirs, files in os.walk(python_dest / packages):
  160. root = Path(root)
  161. for d in dirs:
  162. if d == '__pycache__':
  163. shutil.rmtree(root / d, ignore_errors=True)
  164. for f in files:
  165. if f.endswith('.pyc'):
  166. (root / f).unlink()
  167. # Map binary dependencies.
  168. libs = self.ldd(self.python_prefix / f'bin/{flavoured_python}')
  169. path = Path(self.python_prefix / f'{packages}/lib-dynload')
  170. for module in glob.glob(str(path / "*.so")):
  171. l = self.ldd(module)
  172. libs.update(l)
  173. # Copy and patch binary dependencies.
  174. libdir = system_dest / 'lib'
  175. libdir.mkdir(exist_ok=True, parents=True)
  176. for (name, src) in libs.items():
  177. dst = libdir / name
  178. shutil.copy(src, dst, follow_symlinks=True)
  179. # Some libraries are read-only, which prevents overriding the
  180. # destination directory. Below, we change the permission of
  181. # destination files to read-write (for the owner).
  182. mode = dst.stat().st_mode
  183. if not (mode & stat.S_IWUSR):
  184. mode = mode | stat.S_IWUSR
  185. dst.chmod(mode)
  186. self.set_rpath(dst, '$ORIGIN')
  187. # Patch RPATHs of binary modules.
  188. log('LINK', '%s C-extensions', python)
  189. path = Path(python_dest / f'{packages}/lib-dynload')
  190. for module in glob.glob(str(path / "*.so")):
  191. src = Path(module)
  192. dst = os.path.relpath(libdir, src.parent)
  193. self.set_rpath(src, f'$ORIGIN/{dst}')
  194. # Patch RPATHs of Python runtime.
  195. src = python_dest / runtime
  196. dst = os.path.relpath(libdir, src.parent)
  197. self.set_rpath(src, f'$ORIGIN/{dst}')
  198. # Copy SSL certificates (i.e. clone certifi).
  199. certs = self.prefix / 'opt/_internal/certs.pem'
  200. if certs.is_symlink():
  201. dst = self.prefix / str(certs.readlink())[1:]
  202. certifi = dst.parent
  203. assert(certifi.name == 'certifi')
  204. site_packages = certifi.parent
  205. assert(site_packages.name == 'site-packages')
  206. log('INSTALL', certifi.name)
  207. matches = [
  208. Path(src) for src in glob.glob(str(site_packages / 'certifi*'))
  209. ]
  210. matches = sorted(matches, key=lambda src: src.name)
  211. cert_src = None
  212. for src in matches:
  213. dst = python_dest / f'{packages}/site-packages/{src.name}'
  214. if not dst.exists():
  215. shutil.copytree(src, dst, symlinks=True)
  216. if cert_src is None:
  217. cacert_pem = dst / 'cacert.pem'
  218. if cacert_pem.exists():
  219. cert_src = cacert_pem
  220. assert(cert_src is not None)
  221. else:
  222. raise NotImplementedError()
  223. # Copy Tcl & Tk data.
  224. tx_version = []
  225. for match in glob.glob(str(system_dest / 'lib/libtk*')):
  226. path = system_dest / f'lib/{match}'
  227. tx_version.append(LooseVersion(path.name[5:8]))
  228. if tx_version:
  229. tx_version.sort()
  230. tx_version = tx_version[-1]
  231. for location in ('usr/local/lib', 'usr/share', 'usr/share/tcltk'):
  232. tcltk_src = self.prefix / location
  233. path = tcltk_src / f'tk{tx_version}'
  234. if path.exists() and path.is_dir():
  235. break
  236. else:
  237. raise ValueError(f'could not locate Tcl/Tk{tx_version}')
  238. log('INSTALL', f'Tcl/Tk{tx_version}')
  239. tcltk_dir = Path(system_dest / 'share/tcltk')
  240. tcltk_dir.mkdir(exist_ok=True, parents=True)
  241. for tx in ('tcl', 'tk'):
  242. name = f'{tx}{tx_version}'
  243. src = tcltk_src / name
  244. dst = tcltk_dir / name
  245. shutil.copytree(src, dst, symlinks=True, dirs_exist_ok=True)
  246. if appify:
  247. appifier = Appifier(
  248. appdir = str(destination),
  249. appdir_bin = str(system_dest / 'bin'),
  250. python_bin = str(python_dest / 'bin'),
  251. python_pkg = str(python_dest / packages),
  252. version = self.version,
  253. tk_version = tx_version,
  254. cert_src = cert_src
  255. )
  256. appifier.appify()
  257. def ldd(self, target: Path) -> Dict[str, Path]:
  258. '''Cross-platform implementation of ldd, using readelf.'''
  259. pattern = re.compile(r'[(]NEEDED[)]\s+Shared library:\s+\[([^\]]+)\]')
  260. dependencies = dict()
  261. def recurse(target: Path):
  262. result = subprocess.run(f'readelf -d {target}', shell=True,
  263. check=True, capture_output=True)
  264. stdout = result.stdout.decode()
  265. matches = pattern.findall(stdout)
  266. for match in matches:
  267. if (match not in dependencies) and (match not in self.excluded):
  268. path = self.locate_library(match)
  269. dependencies[match] = path
  270. recurse(path)
  271. recurse(target)
  272. return dependencies
  273. def locate_library(self, name: str) -> Path:
  274. '''Locate a library given its qualified name.'''
  275. for dirname in self.library_path:
  276. path = dirname / name
  277. if path.exists():
  278. return path
  279. else:
  280. raise FileNotFoundError(name)
  281. def set_rpath(self, target, rpath):
  282. cmd = f'{self.patchelf} --print-rpath {target}'
  283. result = subprocess.run(cmd, shell=True, check=True,
  284. capture_output=True)
  285. current_rpath = result.stdout.decode().strip()
  286. if current_rpath != rpath:
  287. cmd = f"{self.patchelf} --set-rpath '{rpath}' {target}"
  288. subprocess.run(cmd, shell=True, check=True, capture_output=True)
  289. @dataclass(frozen=True)
  290. class ImageExtractor:
  291. '''Manylinux image extractor from layers.'''
  292. prefix: Path
  293. '''Manylinux image prefix.'''
  294. tag: Optional[str] = 'latest'
  295. '''Manylinux image tag.'''
  296. def default_destination(self):
  297. return self.prefix / f'extracted/{self.tag}'
  298. def extract(self, destination: Optional[Path]=None, *, clean=False):
  299. '''Extract Manylinux image.'''
  300. if destination is None:
  301. destination = self.default_destination()
  302. if clean:
  303. def clean(destination):
  304. shutil.rmtree(destination, ignore_errors=True)
  305. atexit.register(clean, destination)
  306. log('EXTRACT', f'{self.prefix.name}:{self.tag}')
  307. with open(self.prefix / f'tags/{self.tag}.json') as f:
  308. meta = json.load(f)
  309. layers = meta['layers']
  310. extracted = []
  311. extracted_file = destination / '.extracted'
  312. if destination.exists():
  313. clean_destination = True
  314. if extracted_file.exists():
  315. with extracted_file.open() as f:
  316. extracted = f.read().split(os.linesep)[:-1]
  317. for a, b in zip(layers, extracted):
  318. if a != b:
  319. break
  320. else:
  321. clean_destination = False
  322. if clean_destination:
  323. shutil.rmtree(destination, ignore_errors=True)
  324. for i, layer in enumerate(layers):
  325. try:
  326. if layer == extracted[i]:
  327. continue
  328. except IndexError:
  329. pass
  330. debug('EXTRACT', f'{layer}.tar.gz')
  331. filename = self.prefix / f'layers/{layer}.tar.gz'
  332. cmd = ''.join((
  333. f'trap \'chmod u+rw -R {destination}\' EXIT ; ',
  334. f'mkdir -p {destination} && ',
  335. f'tar -xzf {filename} --exclude=dev -C {destination} && ',
  336. f'echo \'{layer}\' >> {extracted_file}'
  337. ))
  338. r = subprocess.run(f'/bin/bash -c "{cmd}"', shell=True,
  339. capture_output=True)
  340. if r.returncode != 0:
  341. raise ValueError(r.stderr.decode())