From edd83a408d4ebe959596efe35743fa7c0d185ff1 Mon Sep 17 00:00:00 2001
From: Owen W. Taylor
Date: Jan 31 2018 10:21:01 +0000
Subject: resolve-files.py: Piggy-back our files map off of fedmod metadata

Instead of having a separate download process, use the metadata that
fedmod is already downloading, and just add the cached file => package
map in the download directory.

---
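
A note on what this change expects on disk: get_files_map() below reads
fedmod's already-downloaded metadata out of
$XDG_CACHE_HOME/fedmod/repos/<repo>/x86_64/ rather than fetching repos
itself. A minimal pre-flight sketch, using only paths that appear in this
patch (the REPOS list must stay in sync with fedmod's repo naming):

    import os

    XDG_CACHE_HOME = os.environ.get("XDG_CACHE_HOME") or os.path.expanduser("~/.cache")
    repomd = os.path.join(XDG_CACHE_HOME, "fedmod/repos/f28-packages/x86_64",
                          "repodata/repomd.xml")
    # resolve-files.py exits with "try 'fedmod fetch-metadata'" if this is missing
    print("metadata cached" if os.path.exists(repomd) else "run 'fedmod fetch-metadata' first")
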
diff --git a/resolve-files.py b/resolve-files.py
index bfa87f3..e8f82f1 100755
--- a/resolve-files.py
+++ b/resolve-files.py
@@ -1,17 +1,22 @@
 #!/usr/bin/python3
 
 from functools import cmp_to_key
-import dnf
 import gzip
 import hashlib
 import os
 import pickle
 import re
+import rpm
 import sys
+import xml.etree.ElementTree as ET
+import xml.sax
 
-REPO_F27 = "https://mirrors.fedoraproject.org/metalink?repo=fedora-27&arch=x86_64"
-REPO_F27_UPDATES = "https://mirrors.fedoraproject.org/metalink?repo=updates-released-f27&arch=x86_64"
-REPO_F27_UPDATES_TESTING = "https://mirrors.fedoraproject.org/metalink?repo=updates-testing-f27&arch=x86_64"
+XDG_CACHE_HOME = os.environ.get("XDG_CACHE_HOME") or os.path.expanduser("~/.cache")
+
+# This needs to be in sync with fedmod
+REPOS = [
+    "f28-packages"
+]
 
 ignore = set()
 rename = dict()
@@ -272,71 +277,146 @@ platform_package_ignore_patterns = [
 ]
 
 platform_package_ignore_compiled = [re.compile(p) for p in platform_package_ignore_patterns]
-def package_cmp(a, b):
-    if a.arch == 'i686' and b.arch != 'i686':
+# We need to look up a lot of file dependencies. dnf/libsolv is not fast at doing
+# this (at least when we look up files one-by-one) so we create a hash table that
+# maps from *all* files in the distribution to the "best" package that provides
+# the file. To further speed this up, we pickle the result and store it, and only
+# recreate it when the DNF metadata changes. We gzip the pickle to save space
+# (70M instead of 700M); this slows things down by about 2 seconds.
+#
+
+def package_cmp(p1, p2):
+    n1, e1, v1, r1, a1 = p1
+    n2, e2, v2, r2, a2 = p2
+
+    if a1 == 'i686' and a2 != 'i686':
         return 1
-    if a.arch != 'i686' and b.arch == 'i686':
+    if a1 != 'i686' and a2 == 'i686':
         return -1
-    c = - a.evr_cmp(b)
-    if c < 0:
+
+    if n1 < n2:
         return -1
-    elif c == 0:
-        return 0
-    else:
+    elif n1 > n2:
         return 1
 
-class Progress(dnf.callback.DownloadProgress):
-    def __init__(self, name):
-        self.name = name
-        self.hashes = -1
-    def start(self, total_files, total_size, total_drpms=0):
+    if e1 is None:
+        e1 = '0'
+    if e2 is None:
+        e2 = '0'
+
+    return - rpm.labelCompare((e1, v1, r1), (e2, v2, r2))
+
+class FilesMapHandler(xml.sax.handler.ContentHandler):
+    def __init__(self, files_map):
+        self.files_map = files_map
+        self.name = None
+        self.arch = None
+        self.epoch = None
+        self.version = None
+        self.release = None
+        self.file = None
+
+    def startElement(self, name, attrs):
+        if name == 'package':
+            self.name = attrs['name']
+            self.arch = attrs['arch']
+        elif name == 'version':
+            if self.name is not None:
+                self.epoch = attrs['epoch']
+                self.version = attrs['ver']
+                self.release = attrs['rel']
+        elif name == 'file':
+            self.file = ''
+
+    def endElement(self, name):
+        if name == 'package':
+            self.name = None
+        elif name == 'file':
+            package = (self.name, self.epoch, self.version, self.release, self.arch)
+            old = self.files_map.get(self.file)
+            if old is None or package_cmp(package, old) < 0:
+                self.files_map[self.file] = package
+
+            self.file = None
+
+    def characters(self, content):
+        if self.file is not None:
+            self.file += content
+
+def scan_filelists(filelists_path, files_map):
+    handler = FilesMapHandler(files_map)
+    with gzip.open(filelists_path, 'rb') as f:
+        xml.sax.parse(f, handler)
+
+
+def make_files_map(repo_info):
+    files_map = {}
+
+    for repo in REPOS:
+        start("Scanning files for {}".format(repo))
+        repo_dir, repomd_contents = repo_info[repo]
+        root = ET.fromstring(repomd_contents)
+
+        ns = {'repo': 'http://linux.duke.edu/metadata/repo'}
+        filelists_location = root.find("./repo:data[@type='filelists']/repo:location", ns).attrib['href']
+        filelists_path = os.path.join(repo_dir, filelists_location)
+        if os.path.commonprefix([filelists_path, repo_dir]) != repo_dir:
+            done()
+            error("{}: filelists directory is outside of repository".format(repo_dir))
+
+        scan_filelists(filelists_path, files_map)
+        done()
+
+    start("Finalizing files map")
+    for k in files_map:
+        files_map[k] = files_map[k][0]
+    done()
+
+    return files_map
+
+
+def get_files_map():
+    hash_text = ''
+    repos_dir = os.path.join(XDG_CACHE_HOME, "fedmod/repos")
+    repo_info = {}
+    for repo in REPOS:
+        repo_dir = os.path.join(repos_dir, repo, 'x86_64')
+        repomd_path = os.path.join(repo_dir, 'repodata/repomd.xml')
+        try:
+            with open(repomd_path, 'rb') as f:
+                repomd_contents = f.read()
+        except (OSError, IOError):
+            print("Cannot read {}, try 'fedmod fetch-metadata'".format(repomd_path), file=sys.stderr)
+            sys.exit(1)
+
+        repo_info[repo] = (repo_dir, repomd_contents)
+        hash_text += '{}|{}\n'.format(repo, hashlib.sha256(repomd_contents).hexdigest())
+
+    repo_hash = hashlib.sha256(hash_text.encode("UTF-8")).hexdigest()
+
+    files_map_path = os.path.join(XDG_CACHE_HOME, "fedmod/flatpak-runtime-files-map.gz")
+
+    try:
+        with gzip.open(files_map_path, 'rb') as f:
+            old_repo_hash = f.read(64).decode('utf-8')
+            if old_repo_hash == repo_hash:
+                start("Reading cached file map")
+                files_map = pickle.load(f)
+                done()
+
+                return files_map
+    except FileNotFoundError:
         pass
-    def progress(self, payload, done):
-        if payload.download_size == 0:
-            return
-        hashes = int(60 * done / payload.download_size)
-        if hashes != self.hashes:
-            hashline = "[" + "#" * hashes + " " * (60 - hashes) + "]"
-            if self.hashes != -1:
-                print("\r", end="")
-            print("{} - {}k - {}".format(self.name, int(payload.download_size / 1024), hashline), end="")
-            self.hashes = hashes
-    def end(self, payload, status, message):
-        if status == dnf.callback.STATUS_OK:
-            print(" - DONE")
-        else:
-            print
-
-class PackageInfo(object):
-    def __init__(self):
-        self.base = dnf.Base()
-
-        self._add_repo(self.base, 'f27', metalink=REPO_F27)
-        self._add_repo(self.base, 'f27-updates', metalink=REPO_F27_UPDATES)
-        self._add_repo(self.base, 'f27-updates-testing', metalink=REPO_F27_UPDATES_TESTING)
-
-        self.base.fill_sack(load_available_repos=True, load_system_repo=False)
-
-    def _add_repo(self, base, reponame, repourl=None, metalink=None):
-        repo = dnf.repo.Repo(reponame, self.base.conf)
-        if repourl is not None:
-            repo.baseurl = repourl
-        elif metalink is not None:
-            repo.metalink = metalink
-        else:
-            raise RuntimeError("Either baseurl or metalink must be specified")
-        repo.set_progress_bar(Progress(reponame))
-        repo.load()
-        repo.enable()
-        base.repos.add(repo)
-
-    def repo_hash(self):
-        h = hashlib.sha256()
-        for r in sorted(self.base.repos.all(), key=lambda x: x.name):
-            repomd_file = os.path.join(r._cachedir, 'repodata/repomd.xml')
-            with open(repomd_file, 'rb') as f:
-                h.update(f.read())
-        return h.hexdigest()
+
+    files_map = make_files_map(repo_info)
+
+    start("Writing file map to cache")
+    with gzip.open(files_map_path, 'wb') as f:
+        f.write(repo_hash.encode('utf-8'))
+        pickle.dump(files_map, f)
+    done()
+
+    return files_map
 
 if len(sys.argv) != 2:
     print("Usage: resolve-files.py INFILE", file=sys.stderr)
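
The rewritten package_cmp() above prefers non-i686 packages, then sorts by
name, and only then falls back to rpm.labelCompare(), negated so that the
newest (epoch, version, release) sorts first; this replaces the hawkey
evr_cmp() call removed above. A minimal sketch of that primitive, assuming
the standard python3-rpm bindings that the new "import rpm" pulls in:

    import rpm

    # cmp()-style result (-1, 0, 1) with RPM version semantics, so the
    # numeric segment comparison makes '1.10' newer than '1.9'.
    print(rpm.labelCompare(('0', '1.9', '1.fc28'), ('0', '1.10', '1.fc28')))   # -1
    # package_cmp() normalizes a missing epoch to '0' before comparing, and
    # negates the result so newer packages sort first.
    print(-rpm.labelCompare(('0', '1.10', '1.fc28'), ('0', '1.9', '1.fc28')))  # -1
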
@@ -354,6 +434,10 @@ is_sdk = "-Sdk" in base_path
 def warn(msg):
     print("{}: \033[31m{}\033[39m".format(inpath, msg), file=sys.stderr)
 
+def error(msg):
+    print("{}: \033[31m{}\033[39m".format(inpath, msg), file=sys.stderr)
+    sys.exit(1)
+
 def start(msg):
     print("{}: \033[90m{} ... \033[39m".format(inpath, msg), file=sys.stderr, end="")
     sys.stderr.flush()
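
For context on the scanning step: make_files_map() locates each repo's
filelists document via repomd.xml, then streams it through the SAX handler
instead of building a DOM; per the comment above, even the finished map
pickles to roughly 700M, so the raw XML is too large to parse into a tree
comfortably. A condensed sketch of the repomd lookup, using the same
namespace and XPath as the patch ('repomd.xml' here stands in for the
cached copy):

    import xml.etree.ElementTree as ET

    ns = {'repo': 'http://linux.duke.edu/metadata/repo'}
    root = ET.parse('repomd.xml').getroot()
    location = root.find("./repo:data[@type='filelists']/repo:location", ns)
    print(location.attrib['href'])  # e.g. 'repodata/...-filelists.xml.gz'
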
@@ -375,52 +459,7 @@ to_resolve.sort()
 
 done()
 
-pkgs = PackageInfo()
-
-files_map = None
-
-# We need to look up a lot of file dependencies. dnf/libsolv is not fast at doing
-# this (at least when we look up files one-by-one) so we create a hash table that
-# maps from *all* files in the distribution to the "best" package that provides
-# the file. To further speed this up, we pickle the result and store it, and only
-# recreate it when the DNF metadata changes. We gzip the pickle to save space
-# (70M instead of 700M), this slows things down by about 2 seconds.
-#
-repo_hash = pkgs.repo_hash()
-f = None
-try:
-    f = gzip.open('cache/files.map.gz', 'rb')
-    with f:
-        old_repo_hash = f.read(64).decode('utf-8')
-        if old_repo_hash == repo_hash:
-            start("Reading cached file map")
-            files_map = pickle.load(f)
-            done()
-except FileNotFoundError:
-    pass
-except (pickle.UnpicklingError, EOFError) as e:
-    done()
-    warn("Failed to load cache/files.map.gz:", e)
-
-if files_map is None:
-    start("Creating file map")
-    files_map = {}
-    for p in pkgs.base.sack.query().filter():
-        for f in p.files:
-            old = files_map.get(f, None)
-            if old is None or package_cmp(p, old) < 0:
-                files_map[f] = p
-
-    for k, v in files_map.items():
-        files_map[k] = v.name
-    done()
-
-    start("Writing file map to cache")
-    with gzip.open('cache/files.map.gz', 'wb') as f:
-        f.write(repo_hash.encode('utf-8'))
-        pickle.dump(files_map, f)
-    done()
-
+files_map = get_files_map()
 found_packages = set()
 
 start("Resolving files to packages")
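
For reference, the cache that get_files_map() writes is one gzip stream:
the first 64 bytes are the hex SHA-256 fingerprint of the repomd contents,
followed immediately by the pickled dict mapping file paths to package
names. A minimal sketch of reading it back outside the script, assuming a
previous run has already written the file:

    import gzip
    import os
    import pickle

    XDG_CACHE_HOME = os.environ.get("XDG_CACHE_HOME") or os.path.expanduser("~/.cache")
    path = os.path.join(XDG_CACHE_HOME, "fedmod/flatpak-runtime-files-map.gz")

    with gzip.open(path, 'rb') as f:
        repo_hash = f.read(64).decode('utf-8')  # repo-state fingerprint
        files_map = pickle.load(f)              # dict: file path -> package name
    print(repo_hash)
    print(files_map.get('/usr/bin/bash'))       # e.g. 'bash' (illustrative lookup)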