#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

# Ignore the invalid snake-case error for the module name.
# pylint: disable=invalid-name

# Copyright (C) 2012-2020 Xyne
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# (version 2) as published by the Free Software Foundation.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

'''
Retrieve Arch Linux mirrors.
'''

import argparse
import base64
import calendar
import datetime
import http.client
import itertools
import json
import logging
import os
import queue
import re
import shlex
import socket
import subprocess
import sys
import tempfile
import threading
import time
import urllib.error
import urllib.parse
import urllib.request


# -------------------------------- Constants --------------------------------- #

NAME = 'Reflector'

URL = 'https://www.archlinux32.org/mirrors/status/json'

DISPLAY_TIME_FORMAT = '%Y-%m-%d %H:%M:%S UTC'
PARSE_TIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
PARSE_TIME_FORMAT_WITH_USEC = '%Y-%m-%dT%H:%M:%S.%fZ'

DB_SUBPATH = 'i686/core/core.db'

MIRROR_URL_FORMAT = '{0}{2}/{1}'
MIRRORLIST_ENTRY_FORMAT = "Server = " + MIRROR_URL_FORMAT + "\n"

DEFAULT_CONNECTION_TIMEOUT = 5
DEFAULT_CACHE_TIMEOUT = 300
# os.cpu_count() returns None when the count cannot be determined.
DEFAULT_N_THREADS = os.cpu_count() or 1

SORT_TYPES = {
    'age': 'last server synchronization',
    'rate': 'download rate',
    'country': 'server\'s location',
    'score': 'MirrorStatus score',
    'delay': 'MirrorStatus delay',
}


# ------------------------------- IO Functions ------------------------------- #

def get_cache_file(name='mirrorstatus.json'):
    '''
    Get the path of a file in a nearly XDG-compliant cache directory and make
    sure that its parent directory exists.

    PyXDG is not used to avoid the external dependency. It is not fully
    compliant because it omits the application name, but the mirror status file
    can be reused by other applications and this stores no other files.

    name: The file name (or relative path) inside the cache directory.

    Returns the full path to the cache file.
    '''
    # An empty XDG_CACHE_HOME is treated like an unset one; otherwise the cache
    # path would silently become relative to the current directory.
    cache_dir = os.getenv('XDG_CACHE_HOME') or os.path.expanduser('~/.cache')
    path = os.path.join(cache_dir, name)
    try:
        os.makedirs(os.path.dirname(path), exist_ok=True)
    # Raised by makedirs if permissions do not match umask
    except FileExistsError:
        pass
    return path


def get_mirrorstatus(
        connection_timeout=DEFAULT_CONNECTION_TIMEOUT,
        cache_timeout=DEFAULT_CACHE_TIMEOUT,
        url=URL
):
    '''
    Retrieve the mirror status JSON object. The downloaded data will be cached
    locally and re-used within the cache timeout period.

    connection_timeout: Timeout in seconds passed to urlopen.
    cache_timeout: Maximum age in seconds of the local cache file.
    url: The URL of the mirror status JSON.

    Returns the object and the local cache's modification time.

    Raises MirrorStatusError if the data cannot be retrieved or parsed.
    '''
    if url == URL:
        cache_path = get_cache_file()
    else:
        # Non-default URLs get their own cache file named after the URL.
        name = base64.urlsafe_b64encode(url.encode('utf-8')).decode('utf-8') + '.json'
        name = os.path.join(NAME, name)
        cache_path = get_cache_file(name=name)

    try:
        mtime = os.path.getmtime(cache_path)
        invalid = (time.time() - mtime) > cache_timeout
    except FileNotFoundError:
        mtime = None
        invalid = True

    try:
        if invalid:
            with urllib.request.urlopen(url, None, connection_timeout) as handle:
                obj = json.loads(handle.read().decode())
            with open(cache_path, 'w') as handle:
                json.dump(obj, handle, sort_keys=True, indent=2)
            mtime = time.time()
        else:
            with open(cache_path, 'r') as handle:
                obj = json.load(handle)
        return obj, mtime
    # ValueError covers invalid JSON, undecodable bytes and malformed URLs;
    # HTTPException covers protocol-level failures that are not OSErrors.
    except (
            IOError,
            urllib.error.URLError,
            socket.timeout,
            http.client.HTTPException,
            ValueError
    ) as err:
        raise MirrorStatusError(
            f'failed to retrieve mirrorstatus data: {err.__class__.__name__}: {err}'
        ) from err


# ------------------------------ Miscellaneous ------------------------------- #

def get_logger():
    '''
    Get the logger used by this module. Use this to be sure that the right logger
    is used.
    '''
    return logging.getLogger(NAME)


def format_last_sync(mirrors):
    '''
    Parse the "last_sync" field of each mirror and replace it in place with the
    corresponding Unix timestamp.

    The conversion is idempotent: a "last_sync" value that is not a string (i.e.
    one that was already converted by a previous pass over the same mirror
    dictionaries) is left untouched.

    Yields the updated mirror dictionaries.
    '''
    for mirror in mirrors:
        last_sync = mirror['last_sync']
        if isinstance(last_sync, str):
            last_sync = calendar.timegm(time.strptime(last_sync, PARSE_TIME_FORMAT))
            mirror.update(last_sync=last_sync)
        yield mirror


def count_countries(mirrors):
    '''
    Count the mirrors in each country.

    Returns a dictionary mapping (country, country_code) tuples to the number of
    mirrors. Mirrors for which both fields are empty are skipped.
    '''
    countries = {}
    for mirror in mirrors:
        key = (mirror['country'], mirror['country_code'])
        if not any(key):
            continue
        countries[key] = countries.get(key, 0) + 1
    return countries


# --------------------------------- Sorting ---------------------------------- #

def sort(mirrors, by=None, n_threads=DEFAULT_N_THREADS):  # pylint: disable=invalid-name
    '''
    Sort mirrors by different criteria.

    mirrors: An iterable of mirror dictionaries. A list is sorted in place.
    by: "age", "rate" or the name of any field of the mirror dictionaries.
    n_threads: The number of threads to use when sorting by rate.

    Returns the sorted list.

    Raises MirrorStatusError if the mirrors lack the requested field.
    '''
    # Ensure that "mirrors" is a list that can be sorted.
    if not isinstance(mirrors, list):
        mirrors = list(mirrors)

    if by == 'age':
        mirrors.sort(key=lambda m: m['last_sync'], reverse=True)

    elif by == 'rate':
        rates = rate(mirrors, n_threads=n_threads)
        mirrors = sorted(mirrors, key=lambda m: rates[m['url']], reverse=True)

    else:
        try:
            mirrors.sort(key=lambda m: m[by])
        except KeyError as err:
            raise MirrorStatusError(
                'attempted to sort mirrors by unrecognized criterion: "{}"'.format(by)
            ) from err

    return mirrors


# ---------------------------------- Rating ---------------------------------- #

def rate_rsync(db_url, connection_timeout=DEFAULT_CONNECTION_TIMEOUT):
    '''
    Download a database via rsync and return the time and rate of the download.

    Returns (elapsed seconds, bytes per second), or (0, 0) if the download
    failed or could not be timed.
    '''
    rsync_cmd = [
        'rsync',
        '-avL', '--no-h', '--no-motd',
        '--contimeout={}'.format(connection_timeout),
        db_url
    ]
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            time_0 = time.perf_counter()
            subprocess.check_call(
                rsync_cmd + [tmpdir],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            )
            time_delta = time.perf_counter() - time_0
            size = os.path.getsize(
                os.path.join(tmpdir, os.path.basename(DB_SUBPATH))
            )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
        return 0, 0
    # Guard against a zero elapsed time, which would raise ZeroDivisionError.
    if time_delta <= 0:
        return 0, 0
    return time_delta, size / time_delta


def rate_http(db_url, connection_timeout=DEFAULT_CONNECTION_TIMEOUT):
    '''
    Download a database via any protocol supported by urlopen and return the time
    and rate of the download.

    Returns (elapsed seconds, bytes per second), or (0, 0) if the download
    failed or could not be timed.
    '''
    try:
        req = urllib.request.Request(url=db_url)
        with urllib.request.urlopen(req, None, connection_timeout) as handle:
            time_0 = time.perf_counter()
            size = len(handle.read())
            time_delta = time.perf_counter() - time_0
    # ValueError is raised by urllib for malformed URLs.
    except (OSError, urllib.error.HTTPError, http.client.HTTPException, ValueError):
        return 0, 0
    # Guard against a zero elapsed time, which would raise ZeroDivisionError.
    if time_delta <= 0:
        return 0, 0
    return time_delta, size / time_delta


def rate(mirrors, n_threads=DEFAULT_N_THREADS, connection_timeout=DEFAULT_CONNECTION_TIMEOUT):
    '''
    Rate mirrors by timing the download of the core repo's database for each one.

    mirrors: An iterable of mirror dictionaries with a "url" field.
    n_threads: The maximum number of worker threads.
    connection_timeout: The connection timeout in seconds for each download.

    Returns a dictionary mapping each mirror URL to its download rate in bytes
    per second (0 for failed downloads), or None if there are no mirrors.
    '''
    # Ensure that mirrors is not a generator so that its length can be determined.
    if not isinstance(mirrors, tuple):
        mirrors = tuple(mirrors)

    if not mirrors:
        return None

    # At least 1 thread and not more than the number of mirrors.
    n_threads = max(1, min(n_threads or 1, len(mirrors)))

    # URL input queue.
    q_in = queue.Queue()
    # URL, elapsed time and rate output queue.
    q_out = queue.Queue()

    logger = get_logger()

    def worker():
        # Each worker processes URLs until it receives a None sentinel. Every
        # URL produces exactly one result and every queue item is marked as
        # done, even on unexpected errors, so that q_in.join() cannot hang.
        while True:
            url = q_in.get()
            try:
                if url is None:
                    return
                try:
                    db_url = url + DB_SUBPATH
                    if urllib.parse.urlparse(url).scheme == 'rsync':
                        time_delta, ratio = rate_rsync(db_url, connection_timeout)
                    else:
                        time_delta, ratio = rate_http(db_url, connection_timeout)
                # Thread boundary: an escaping exception would kill the worker
                # and leave the mirror without a result.
                except Exception as err:  # pylint: disable=broad-except
                    logger.warning('failed to rate %s: %s', url, err)
                    time_delta, ratio = 0, 0
                q_out.put((url, time_delta, ratio))
            finally:
                q_in.task_done()

    workers = tuple(
        threading.Thread(target=worker, daemon=True) for _ in range(n_threads)
    )
    for wkr in workers:
        wkr.start()

    url_len = max(len(m['url']) for m in mirrors)
    for mir in mirrors:
        url = mir['url']
        logger.info('rating %s', url)
        q_in.put(url)

    # One sentinel per worker so that every thread exits instead of blocking
    # forever on the empty queue.
    for _ in workers:
        q_in.put(None)
    q_in.join()
    for wkr in workers:
        wkr.join()

    header_fmt = '{{:{:d}s}}  {{:>14s}}  {{:>9s}}'.format(url_len)
    logger.info(header_fmt.format('Server', 'Rate', 'Time'))
    fmt = '{{:{:d}s}}  {{:8.2f}} KiB/s  {{:7.2f}} s'.format(url_len)

    # Loop over the mirrors just to ensure that we get the rate for each mirror.
    # The value in the loop does not (necessarily) correspond to the mirror.
    rates = {}
    for _ in mirrors:
        url, dtime, ratio = q_out.get()
        kibps = ratio / 1024.0
        logger.info(fmt.format(url, kibps, dtime))
        rates[url] = ratio
        q_out.task_done()

    return rates


# ---------------------------- MirrorStatusError ----------------------------- #

class MirrorStatusError(Exception):
    '''
    Common base exception raised by this module.

    The message is available through the "msg" attribute.
    '''

    def __init__(self, msg):
        # Pass the message on so that "args" is populated like for any other
        # exception.
        super().__init__(msg)
        self.msg = msg

    def __str__(self):
        return repr(self.msg)


# ---------------------------- MirrorStatusFilter ---------------------------- #

class MirrorStatusFilter():  # pylint: disable=too-many-instance-attributes
    '''
    Filter mirrors by different criteria.

    min_completion_pct: Minimum completion fraction in the range [0-1].
    countries: Country names and/or country codes to keep (case is ignored).
    protocols: Protocols to keep.
    include: Regular expressions of which at least one must match the URL.
    exclude: Regular expressions of which none may match the URL.
    age: Maximum time in hours since the last synchronization.
    isos: Only keep mirrors that host ISOs.
    ipv4: Only keep mirrors that support IPv4.
    ipv6: Only keep mirrors that support IPv6.
    '''

    def __init__(
            self,
            min_completion_pct=1.0,
            countries=None,
            protocols=None,
            include=None,
            exclude=None,
            age=None,
            isos=False,
            ipv4=False,
            ipv6=False
    ):
        self.min_completion_pct = min_completion_pct
        self.countries = tuple(c.upper() for c in countries) if countries else tuple()
        self.protocols = protocols
        self.include = tuple(re.compile(r) for r in include) if include else tuple()
        self.exclude = tuple(re.compile(r) for r in exclude) if exclude else tuple()
        self.age = age
        self.isos = isos
        self.ipv4 = ipv4
        self.ipv6 = ipv6

    def filter_mirrors(self, mirrors):
        '''
        Filter the mirrors.

        Yields the mirrors that satisfy all configured criteria. The "last_sync"
        field of the yielded mirrors is converted to a Unix timestamp.
        '''
        # Filter unsynced mirrors.
        mirrors = (m for m in mirrors if m['last_sync'])

        # Parse the last sync time.
        mirrors = format_last_sync(mirrors)

        # Filter by completion "percent" [0-1].
        mirrors = (m for m in mirrors if m['completion_pct'] >= self.min_completion_pct)

        # Filter by countries.
        if self.countries:
            mirrors = (
                m for m in mirrors
                if m['country'].upper() in self.countries
                or m['country_code'].upper() in self.countries
            )

        # Filter by protocols.
        if self.protocols:
            mirrors = (m for m in mirrors if m['protocol'] in self.protocols)

        # Filter by include expressions.
        if self.include:
            mirrors = (m for m in mirrors if any(r.search(m['url']) for r in self.include))

        # Filter by exclude expressions.
        if self.exclude:
            mirrors = (m for m in mirrors if not any(r.search(m['url']) for r in self.exclude))

        # Filter by age. The age is given in hours and converted to seconds. Servers
        # with a last refresh older than the age are omitted.
        if self.age and self.age > 0:
            tim = time.time()
            age = self.age * 60**2
            mirrors = (m for m in mirrors if (m['last_sync'] + age) >= tim)

        # Filter by ISO hosting.
        if self.isos:
            mirrors = (m for m in mirrors if m['isos'])

        # Filter by IPv4 support.
        if self.ipv4:
            mirrors = (m for m in mirrors if m['ipv4'])

        # Filter by IPv6 support.
        if self.ipv6:
            mirrors = (m for m in mirrors if m['ipv6'])

        yield from mirrors


# -------------------------------- Formatting -------------------------------- #

def format_mirrorlist(mirror_status, mtime, include_country=False, command=None, url=URL):
    '''
    Format the mirrorlist.

    mirror_status: The mirror status object with "last_check" and "urls" fields.
    mtime: The time at which the mirror status was retrieved (Unix timestamp).
    include_country: Insert a country comment before each group of mirrors.
    command: The command-line arguments to record in the header, if any.
    url: The URL from which the mirror status was retrieved.

    Returns the mirrorlist as a string, or None if there are no mirrors.
    '''
    if command is None:
        command = '?'
    else:
        command = 'reflector ' + ' '.join(shlex.quote(x) for x in command)

    last_check = mirror_status['last_check']
    # For some reason the "last_check" field included microseconds.
    try:
        parsed_last_check = datetime.datetime.strptime(
            last_check,
            PARSE_TIME_FORMAT_WITH_USEC,
        ).timetuple()
    except ValueError:
        parsed_last_check = datetime.datetime.strptime(
            last_check,
            PARSE_TIME_FORMAT,
        ).timetuple()

    width = 80
    colw = 11
    header = '# Arch Linux mirrorlist generated by Reflector #'.center(width, '#')
    border = '#' * len(header)
    info = (
        ('With:', command),
        ('When:', time.strftime(DISPLAY_TIME_FORMAT, time.gmtime())),
        ('From:', url),
        ('Retrieved:', time.strftime(DISPLAY_TIME_FORMAT, time.gmtime(mtime))),
        ('Last Check:', time.strftime(DISPLAY_TIME_FORMAT, parsed_last_check))
    )

    # Collect the pieces and join them once at the end.
    parts = [
        f'{border}\n{header}\n{border}\n\n',
        '\n'.join(f'# {key:<{colw}s} {value}' for key, value in info),
        '\n\n'
    ]

    country = None
    mirrors = mirror_status['urls']
    for mirror in mirrors:
        # Include country tags. This is intended for lists that are sorted by
        # country.
        if include_country:
            ctry = '{} [{}]'.format(mirror['country'], mirror['country_code'])
            if ctry != country:
                if country:
                    parts.append('\n')
                parts.append('# {}\n'.format(ctry))
                country = ctry
        parts.append(MIRRORLIST_ENTRY_FORMAT.format(mirror['url'], '$repo', '$arch'))

    if mirrors:
        return ''.join(parts)
    return None


# -------------------------- MirrorStatus Retriever -------------------------- #

class MirrorStatus():
    '''
    This is a legacy class that will likely be removed in the future. It
    previously held most of this module's functionality until it was refactored
    into more modular functions. Seemingly pointless code is still used by
    importers of this module.
    '''

    # TODO: move these to another module or remove them completely
    # Related: https://bugs.archlinux.org/task/32895
    REPOSITORIES = (
        'community',
        'community-staging',
        'community-testing',
        'core',
        'extra',
        'staging',
        'testing'
    )
    # Officially supported system architectures.
    ARCHITECTURES = ['i486', 'i686', 'pentium4']

    MIRROR_URL_FORMAT = MIRROR_URL_FORMAT
    MIRRORLIST_ENTRY_FORMAT = MIRRORLIST_ENTRY_FORMAT

    def __init__(
            self,
            connection_timeout=DEFAULT_CONNECTION_TIMEOUT,
            cache_timeout=DEFAULT_CACHE_TIMEOUT,
            min_completion_pct=1.0,
            threads=DEFAULT_N_THREADS,
            url=URL
    ):
        self.connection_timeout = connection_timeout
        self.cache_timeout = cache_timeout
        self.min_completion_pct = min_completion_pct
        self.threads = threads
        self.url = url

        self.mirror_status = None
        self.ms_mtime = 0

    def retrieve(self):
        '''
        Retrieve the mirrorstatus data.
        '''
        self.mirror_status, self.ms_mtime = get_mirrorstatus(
            connection_timeout=self.connection_timeout,
            cache_timeout=self.cache_timeout,
            url=self.url
        )

    def get_obj(self):
        '''
        Get the JSON mirror status. The data is retrieved again once it is older
        than the cache timeout.
        '''
        t = time.time()  # pylint: disable=invalid-name
        if (t - self.ms_mtime) > self.cache_timeout:
            self.retrieve()
        return self.mirror_status

    def get_mirrors(self):
        '''
        Get the mirrors from the mirror status.

        Raises MirrorStatusError if the mirror status contains no "urls" field.
        '''
        obj = self.get_obj()
        try:
            return obj['urls']
        except KeyError as err:
            raise MirrorStatusError('no mirrors detected in mirror status output') from err

    def filter(self, mirrors=None, **kwargs):
        '''
        Filter mirrors by various criteria. The keyword arguments are passed
        through to MirrorStatusFilter.
        '''
        if mirrors is None:
            mirrors = self.get_mirrors()
        msf = MirrorStatusFilter(min_completion_pct=self.min_completion_pct, **kwargs)
        yield from msf.filter_mirrors(mirrors)

    def sort(self, mirrors=None, **kwargs):
        '''
        Sort mirrors by various criteria. The keyword arguments are passed
        through to the module-level sort function.
        '''
        if mirrors is None:
            mirrors = self.get_mirrors()
        yield from sort(mirrors, n_threads=self.threads, **kwargs)

    def rate(self, mirrors=None, **kwargs):
        '''
        Sort mirrors by download speed.
        '''
        if mirrors is None:
            mirrors = self.get_mirrors()
        yield from sort(mirrors, n_threads=self.threads, by='rate', **kwargs)

    def get_mirrorlist(self, mirrors=None, include_country=False, cmd=None):
        '''
        Get a Pacman-formatted mirrorlist.

        Returns None if there are no mirrors.
        '''
        # Shallow copy so that replacing "urls" does not modify the cached
        # mirror status.
        obj = self.get_obj().copy()
        if mirrors is not None:
            if not isinstance(mirrors, list):
                mirrors = list(mirrors)
            obj['urls'] = mirrors
        return format_mirrorlist(
            obj,
            self.ms_mtime,
            include_country=include_country,
            command=cmd,
            url=self.url
        )

    def list_countries(self):
        '''
        List countries along with a server count for each one.
        '''
        mirrors = self.get_mirrors()
        return count_countries(mirrors)


# ----------------------------- argparse Actions ----------------------------- #

class ListCountries(argparse.Action):
    '''
    Action to list countries along with the number of mirrors in each.
    '''

    def __call__(self, parser, namespace, values, option_string=None):
        ms = MirrorStatus(url=namespace.url)  # pylint: disable=invalid-name
        try:
            countries = ms.list_countries()
        except MirrorStatusError as err:
            sys.exit('error: {}\n'.format(err.msg))
        # max() raises ValueError on an empty table, so only format the table
        # when there is something to print.
        if countries:
            width = max(len(name) for name, _ in countries)
            number = len(str(max(countries.values())))
            fmt = '{{:{:d}s}} {{}} {{:{:d}d}}'.format(width, number)
            for (name, code), count in sorted(countries.items(), key=lambda x: x[0][0]):
                print(fmt.format(name, code, count))
        sys.exit(0)


def print_mirror_info(mirrors, time_fmt=DISPLAY_TIME_FORMAT):
    '''
    Print information about each mirror to STDOUT.

    The fields of the first mirror determine which fields are printed for all
    mirrors. The "last_sync" field is expected to be a Unix timestamp.
    '''
    if mirrors:
        if not isinstance(mirrors, list):
            mirrors = list(mirrors)
        keys = sorted(k for k in mirrors[0].keys() if k != 'url')
        length = max((len(k) for k in keys), default=0)
        fmt = '{{:{:d}s}} : {{}}'.format(length)
        for mir in mirrors:
            print('{}$arch/$repo'.format(mir['url']))
            for key in keys:
                value = mir[key]
                if key == 'last_sync':
                    value = time.strftime(time_fmt, time.gmtime(value))
                print(fmt.format(key, value))
            print()


def add_arguments(parser):
    '''
    Add reflector arguments to the argument parser.

    Returns the parser.
    '''
    parser.add_argument(
        '--connection-timeout', type=int, metavar='n', default=DEFAULT_CONNECTION_TIMEOUT,
        help='The number of seconds to wait before a connection times out. Default: %(default)s'
    )

    # parser.add_argument(
    #     '--download-timeout', type=int, metavar='n',
    #     help='The number of seconds to wait before a download times out. The threshold is checked after each chunk is read, so the actual timeout may take longer.'
    # )

    parser.add_argument(
        '--list-countries', action=ListCountries, nargs=0,
        help='Display a table of the distribution of servers by country.'
    )

    parser.add_argument(
        '--cache-timeout', type=int, metavar='n', default=DEFAULT_CACHE_TIMEOUT,
        help='The cache timeout in seconds for the data retrieved from the Arch Linux Mirror Status API. The default is %(default)s.'
    )

    parser.add_argument(
        '--url', default=URL,
        help='The URL from which to retrieve the mirror data in JSON format. If different from the default, it must follow the same format. Default: %(default)s'
    )

    parser.add_argument(
        '--save', metavar='<filepath>',
        help='Save the mirrorlist to the given path.'
    )

    sort_help = '; '.join('"{}": {}'.format(k, v) for k, v in SORT_TYPES.items())
    parser.add_argument(
        '--sort', choices=SORT_TYPES,
        help='Sort the mirrorlist. {}.'.format(sort_help)
    )

    parser.add_argument(
        '--threads', type=int, metavar='n', default=DEFAULT_N_THREADS,
        help='The maximum number of threads to use when rating mirrors. Keep in mind that this may skew your results if your connection is saturated. Default: %(default)s (number of detected CPUs)'
    )

    parser.add_argument(
        '--verbose', action='store_true',
        help='Print extra information to STDERR. Only works with some options.'
    )

    parser.add_argument(
        '--info', action='store_true',
        help='Print mirror information instead of a mirror list. Filter options apply.'
    )

    filters = parser.add_argument_group(
        'filters',
        'The following filters are inclusive, i.e. the returned list will only contain mirrors for which all of the given conditions are met.'
    )

    filters.add_argument(
        '-a', '--age', type=float, metavar='n',
        help='Only return mirrors that have synchronized in the last n hours. n may be an integer or a decimal number.'
    )

    filters.add_argument(
        '-c', '--country', dest='countries', action='append', metavar='<country>',
        help='Match one of the given countries by name or country code (case-insensitive). Multiple countries may be selected using commas (e.g. "France,Germany") or by passing this option multiple times. Use "--list-countries" to see which are available.'
    )

    filters.add_argument(
        '-f', '--fastest', type=int, metavar='n',
        help='Return the n fastest mirrors that meet the other criteria. Do not use this option without other filtering options.'
    )

    filters.add_argument(
        '-i', '--include', metavar='<regex>', action='append',
        help='Include servers that match <regex>, where <regex> is a Python regular expression.'
    )

    filters.add_argument(
        '-x', '--exclude', metavar='<regex>', action='append',
        help='Exclude servers that match <regex>, where <regex> is a Python regular expression.'
    )

    filters.add_argument(
        '-l', '--latest', type=int, metavar='n',
        help='Limit the list to the n most recently synchronized servers.'
    )

    filters.add_argument(
        '--score', type=int, metavar='n',
        help='Limit the list to the n servers with the highest score.'
    )

    filters.add_argument(
        '-n', '--number', type=int, metavar='n',
        help='Return at most n mirrors.'
    )

    filters.add_argument(
        '-p', '--protocol', dest='protocols', action='append', metavar='<protocol>',
        help='Match one of the given protocols, e.g. "https" or "ftp". Multiple protocols may be selected using commas (e.g. "https,http") or by passing this option multiple times.'
    )

    filters.add_argument(
        '--completion-percent', type=float, metavar='[0-100]', default=100.,
        help='Set the minimum completion percent for the returned mirrors. Check the mirrorstatus webpage for the meaning of this parameter. Default value: %(default)s.'
    )

    filters.add_argument(
        '--isos', action='store_true',
        help='Only return mirrors that host ISOs.'
    )

    filters.add_argument(
        '--ipv4', action='store_true',
        help='Only return mirrors that support IPv4.'
    )

    filters.add_argument(
        '--ipv6', action='store_true',
        help='Only return mirrors that support IPv6.'
    )

    return parser


def split_list_args(args):
    '''
    Split comma-separated list arguments into separate list elements.

    Yields nothing if args is None or empty.
    '''
    if not args:
        return
    for arg in args:
        yield from arg.split(',')


class MyArgumentParser(argparse.ArgumentParser):
    '''
    Custom argument parser to support a more readable format in argument files.
    '''

    def convert_arg_line_to_args(self, arg_line):
        # shlex handles quoting and, with comments=True, comments and blank
        # lines in argument files.
        return shlex.split(arg_line, comments=True)


def parse_args(args=None):
    '''
    Parse command-line arguments.

    Returns the parsed options. The "countries" and "protocols" options are
    always lists, with comma-separated values split into separate elements.
    '''
    parser = MyArgumentParser(
        description='retrieve and filter a list of the latest Arch Linux mirrors',
        fromfile_prefix_chars='@'
    )
    parser = add_arguments(parser)
    options = parser.parse_args(args)
    for list_arg in ('countries', 'protocols'):
        setattr(options, list_arg, list(split_list_args(getattr(options, list_arg))))
    return options


# Process options
def process_options(options, mirrorstatus=None, mirrors=None):
    '''
    Process options. Optionally accepts a MirrorStatus object and/or the mirrors
    as returned by the MirrorStatus.get_mirrors method.

    Returns the MirrorStatus object and the filtered and sorted mirrors.
    '''
    if not mirrorstatus:
        mirrorstatus = MirrorStatus(
            connection_timeout=options.connection_timeout,
            # download_timeout=options.download_timeout,
            cache_timeout=options.cache_timeout,
            min_completion_pct=(options.completion_percent / 100.),
            threads=options.threads,
            url=options.url
        )

    if mirrors is None:
        mirrors = mirrorstatus.get_mirrors()

    # Filter
    mirrors = mirrorstatus.filter(
        mirrors,
        countries=options.countries,
        include=options.include,
        exclude=options.exclude,
        age=options.age,
        protocols=options.protocols,
        isos=options.isos,
        ipv4=options.ipv4,
        ipv6=options.ipv6
    )

    if options.latest and options.latest > 0:
        mirrors = mirrorstatus.sort(mirrors, by='age')
        mirrors = itertools.islice(mirrors, options.latest)

    if options.score and options.score > 0:
        mirrors = mirrorstatus.sort(mirrors, by='score')
        mirrors = itertools.islice(mirrors, options.score)

    if options.fastest and options.fastest > 0:
        mirrors = mirrorstatus.sort(mirrors, by='rate')
        mirrors = itertools.islice(mirrors, options.fastest)

    # The mirrors are already sorted by rate if --fastest was given.
    if options.sort and not (options.sort == 'rate' and options.fastest):
        mirrors = mirrorstatus.sort(mirrors, by=options.sort)

    if options.number:
        mirrors = list(mirrors)[:options.number]

    return mirrorstatus, mirrors


def main(args=None, configure_logging=False):  # pylint: disable=too-many-branches
    '''
    Process command-line arguments and generate a mirror list.

    args: The command-line arguments. sys.argv[1:] is used if omitted.
    configure_logging: Configure this module's logger to print to STDERR.
    '''
    if args:
        cmd = tuple(args)
    else:
        cmd = sys.argv[1:]

    options = parse_args(args)

    # Configure logging.
    logger = get_logger()

    if configure_logging:
        if options.verbose:
            level = logging.INFO
        else:
            level = logging.WARNING

        logger.setLevel(level)
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            fmt='[{asctime:s}] {levelname:s}: {message:s}',
            style='{',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    try:
        mirrorstatus, mirrors = process_options(options)
        # Convert the generator object to a list for re-use later.
        if mirrors is not None and not isinstance(mirrors, list):
            mirrors = list(mirrors)
        if not mirrors:
            sys.exit('error: no mirrors found')
        include_country = options.sort == 'country'
        if options.info:
            print_mirror_info(mirrors)
            return
        mirrorlist = mirrorstatus.get_mirrorlist(mirrors, include_country=include_country, cmd=cmd)
        if mirrorlist is None:
            sys.exit('error: no mirrors found')
    except MirrorStatusError as err:
        sys.exit('error: {}\n'.format(err.msg))

    if options.save:
        try:
            with open(options.save, 'w') as handle:
                handle.write(mirrorlist)
        except IOError as err:
            sys.exit('error: {}\n'.format(err.strerror))
    else:
        print(mirrorlist)


def run_main(args=None, **kwargs):
    '''
    Wrapper around main to handle keyboard interrupts. I don't remember why I
    added this instead of catching the error under the "if __name__" block.
    '''
    try:
        main(args, **kwargs)
    except KeyboardInterrupt:
        pass


if __name__ == "__main__":
    run_main(configure_logging=True)