#!/bin/python3
"""Resolve a list of Maven packages and generate Gradle files that fetch them.

The script reads ``groupId:artifactId[:version]`` lines from a package list,
looks each package up on the configured Maven mirrors, parses its POM (waiting
for parent POMs to be parsed first) and finally writes ``build.gradle.kts`` and
``settings.gradle.kts`` that reference every resolved package.
"""
import argparse
import asyncio
import copy
import logging
import random
import re
import subprocess
from pathlib import Path
from xml.etree import ElementTree as ET

import aiohttp

logger = logging.getLogger(__name__)

ns = {'': 'http://maven.apache.org/POM/4.0.0'}
ET.register_namespace('', ns[''])

baseurl = 'https://search.maven.org'
base_pom_path = Path('poms')
mirrors = [
    "https://repo.maven.apache.org/maven2",
    "https://repo1.maven.org/maven2",
    "https://oss.sonatype.org/content/repositories/snapshots",
    "https://packages.confluent.io/maven",
    "https://registry.quarkus.io/maven",
    "https://plugins.gradle.org/m2",
]

# How often a request is retried after HTTP 429 before the script gives up.
MAX_ATTEMPTS = 50
# Upper bound on property substitution passes; protects against properties
# that (directly or indirectly) reference themselves and never settle.
MAX_PROPERTY_PASSES = 20

# Matches ``${name}`` placeholders inside POM text.
_PROPERTY_RE = re.compile(r'\$\{([^\}]*)\}')

# Shared state between workers. Keys are ``str(Package)`` coordinates.
done: set[str] = set()
done_lock = asyncio.Lock()
in_progress: set[str] = set()
in_progress_lock = asyncio.Lock()
gradle_packages: set[str] = set()
gradle_packages_lock = asyncio.Lock()
# Resolved properties of every parsed POM, keyed by package coordinate.
global_properties: dict[str, dict[str, str]] = {}


class TooManyRequestsException(Exception):
    """A mirror answered with HTTP 429."""


class PackageError(Exception):
    """A package (or its POM) is invalid or could not be obtained."""


class WaitForPackage(Exception):
    """Raised while parsing a POM whose parent has not been processed yet."""

    def __init__(self, package):
        self.package = package


def find_tag_text(parent, tag) -> str | None:
    """Return the text of child ``tag`` of ``parent``, or None if it is absent."""
    elem = parent.find(tag, ns)
    return elem.text if elem is not None else None


class PackagePOM:
    """Parsed POM of a package: parent, resolved properties and BOM contents."""

    def __init__(self, package: 'Package', pom: str):
        """Parse ``pom`` (XML text) for ``package``.

        Raises:
            WaitForPackage: the POM has a parent that is not in ``done`` yet.
            PackageError: the parent declaration is incomplete.
            xml.etree.ElementTree.ParseError: ``pom`` is not well-formed XML.
        """
        self._package = package
        logger.debug(f'{package}: Parsing POM')
        self.raw_root = ET.fromstring(pom)

        self.parent: Package | None = self._resolve_parent()

        logger.debug(f'{package}: Parsing properties')
        parent_props: dict[str, str] = {}
        if self.parent is not None:
            # A parent can be marked done without a parsed POM (e.g. it could
            # not be verified); inherit nothing in that case instead of failing.
            inherited = global_properties.get(str(self.parent))
            if inherited is None:
                logger.warning(f'{package}: No properties known for parent {self.parent}')
            else:
                parent_props = inherited
        self.properties = self.resolve_props(parent_props)
        global_properties[str(package)] = self.properties

        logger.debug(f'{package}: Parsing packaging')
        if (packaging := self.raw_root.find('packaging', ns)) is not None:
            self.packaging = packaging.text
        else:
            self.packaging = '??'
        self.is_bom = self.packaging == 'pom'

        self.gradle_packages = [str(package)]
        if self.is_bom:
            logger.debug(f'{package}: Parsing dependencyManagement')
            self.gradle_packages.extend(self._bom_coordinates())
        logger.debug(f'{package}: POM parsed')

    def _resolve_parent(self) -> 'Package | None':
        """Return the parent package if declared and already done, else raise."""
        parent_tag = self.raw_root.find('parent', ns)
        if parent_tag is None:
            return None
        parent_group = find_tag_text(parent_tag, 'groupId')
        parent_artifact = find_tag_text(parent_tag, 'artifactId')
        parent_version = find_tag_text(parent_tag, 'version')
        logger.debug(
            f'{self._package}: Parsing parent {parent_group}:{parent_artifact}:{parent_version}'
        )
        if parent_group is None or parent_artifact is None or parent_version is None:
            raise PackageError(f'Invalid parent {parent_group}:{parent_artifact}:{parent_version}')
        parent = Package(parent_group, parent_artifact, parent_version)
        if str(parent) not in done:
            raise WaitForPackage(parent)
        return parent

    def _bom_coordinates(self) -> list[str]:
        """Return ``group:artifact:version`` strings listed in dependencyManagement."""
        # Compare against None explicitly: an Element without children is falsy.
        management = self.raw_root.find('dependencyManagement', ns)
        if management is None:
            logger.warning(f'{self._package}: BOM has no dependencyManagement')
            return []
        dependencies = management.find('dependencies', ns)
        entries = dependencies.findall('dependency', ns) if dependencies is not None else []
        if not entries:
            logger.warning(f'{self._package}: dependencyManagement has no dependencies')
            return []

        packages = []
        for dep in entries:
            groupId = find_tag_text(dep, 'groupId')
            artifactId = find_tag_text(dep, 'artifactId')
            version = find_tag_text(dep, 'version')
            if groupId is not None and artifactId is not None and version is not None:
                groupId = self.prop_replace(groupId)
                artifactId = self.prop_replace(artifactId)
                version = self.prop_replace(version)
                packages.append(f'{groupId}:{artifactId}:{version}')
        logger.debug(
            f'{self._package}: Adding {len(packages)} package(s) from dependencyManagement'
        )
        return packages

    def resolve_props(self, initial: dict[str, str]) -> dict[str, str]:
        """Return this POM's properties layered on top of ``initial``.

        ``initial`` (the parent's properties) is copied, not modified, so that
        properties of one child never leak into its parent or siblings.
        Placeholders inside property values are substituted repeatedly until
        nothing changes or ``MAX_PROPERTY_PASSES`` is reached.
        """
        props = dict(initial)
        for prop_tag in self.raw_root.findall('.//properties/*', ns):
            prop = prop_tag.tag.replace(f'{{{ns[""]}}}', '')
            value = prop_tag.text if prop_tag.text is not None else ''
            logger.debug(f'{self._package}: Setting prop {prop}={value}')
            props[prop] = value

        for _ in range(MAX_PROPERTY_PASSES):
            changed = False
            for prop, value in list(props.items()):
                new_value = self.prop_replace(value, props)
                if new_value != value:
                    changed = True
                    logger.debug(f'{self._package}: Setting prop {prop}={new_value}')
                    props[prop] = new_value
            if not changed:
                break
        else:
            logger.warning(
                f'{self._package}: Properties still changing after '
                f'{MAX_PROPERTY_PASSES} passes; giving up'
            )
        return props

    def prop_replace(self, text, props: dict[str, str] | None = None) -> str:
        """Substitute ``${...}`` placeholders in ``text``.

        Values are looked up in ``props`` when given, otherwise in
        ``self.properties``. ``project.groupId``/``artifactId``/``version``
        come from the package itself; build-, environment- and machine-specific
        placeholders as well as unknown ones are replaced by an empty string.
        """
        def lookup_prop(match) -> str:
            prop = match.group(1)
            if prop == 'project.groupId':
                value = str(self._package.groupId)
            elif prop == 'project.artifactId':
                value = str(self._package.artifactId)
            elif prop == 'project.version':
                value = str(self._package.version)
            elif prop.startswith(('project.build', 'env.', 'maven.')):
                value = ''
            elif prop in ['project.basedir', 'basedir', 'user.home', 'debug.port']:
                value = ''
            else:
                try:
                    value = props[prop] if props is not None else self.properties[prop]
                except KeyError:
                    logger.error(
                        f'{self._package}: Could not find property {prop}. Setting it to ""'
                    )
                    value = ''
            logger.debug(f'{self._package}: Replacing property {prop} with {value}')
            return value

        return _PROPERTY_RE.sub(lookup_prop, text)

    def _package_from_xml_dep(self, dep: ET.Element) -> 'Package':
        """Build a Package from a ``<dependency>`` element, resolving properties."""
        def prop_replace_tag(tag) -> str:
            elem = dep.find(tag, ns)
            return self.prop_replace((elem.text or '') if elem is not None else '')

        return Package(
            groupId=prop_replace_tag('groupId'),
            artifactId=prop_replace_tag('artifactId'),
            version=prop_replace_tag('version'),
        )

    @property
    def dependency_management(self) -> list['Package']:
        """Packages listed under ``dependencyManagement/dependencies``."""
        dependencies = self.raw_root.find('dependencyManagement/dependencies', ns)
        if dependencies is None:
            return []
        return [self._package_from_xml_dep(dep) for dep in dependencies]


class Package:
    """A Maven coordinate (group, artifact, optional version) and its lookups."""

    # Cached parsed POM; filled by the ``pom`` property.
    _pom: PackagePOM | None = None
    # True once the package/version was found in some mirror's metadata.
    _verified: bool = False

    def __init__(self, groupId: str, artifactId: str, version: str | None = None,
                 implicit: bool = False):
        """Create a package; an empty or whitespace-only version counts as None.

        ``implicit`` only lowers the log level used when the package is not
        found in any mirror.
        """
        self.groupId = groupId
        self.artifactId = artifactId
        self.version = version if version and not version.isspace() else None
        self.implicit = implicit

    def __str__(self) -> str:
        return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'

    def __eq__(self, other) -> bool:
        if not isinstance(other, Package):
            return NotImplemented
        return (
            self.groupId == other.groupId
            and self.artifactId == other.artifactId
            and self.version == other.version
        )

    def __hash__(self) -> int:
        # NOTE: ``version`` may be filled in later by ``_query_maven``, which
        # changes the hash; do not keep unversioned packages in sets/dict keys.
        return hash((self.groupId, self.artifactId, self.version))

    @property
    def dir_path(self):
        """Repository-relative directory of this package version."""
        group_path = self.groupId.replace(".", "/")
        return f'{group_path}/{self.artifactId}/{self.version}'

    @property
    def base_filename(self):
        """File name stem (``artifactId-version``) of the package's artifacts."""
        return f'{self.artifactId}-{self.version}'

    async def download_file(self, extension):
        """Return the text of ``<artifact>-<version>.<extension>`` or None.

        Mirrors are tried in order; the first HTTP 200 wins.

        Raises:
            TooManyRequestsException: a mirror answered with HTTP 429.
        """
        filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
        async with aiohttp.ClientSession() as session:
            for mirror in mirrors:
                pom_url = f'{mirror}/{filepath}'
                logger.debug(f'{self}: Downloading {extension} from {pom_url}')
                async with session.get(pom_url) as response:
                    if response.status == 200:
                        logger.debug(f'{self}: {extension} downloaded')
                        return await response.text()
                    if response.status == 429:
                        raise TooManyRequestsException()
                    logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}')
        logger.warning(f'{self}: File download of {extension} failed for all mirrors')
        return None

    @property
    async def pom(self) -> PackagePOM:
        """Download and parse the POM (cached after the first success).

        Use as ``await package.pom``.

        Raises:
            PackageError: the POM is unavailable on all mirrors or malformed.
            WaitForPackage: the POM's parent has not been processed yet.
            TooManyRequestsException: a mirror answered with HTTP 429.
        """
        if self._pom is not None:
            return self._pom
        if self.version is None:
            await self._query_maven()
        pom_text = await self.download_file('pom')
        if pom_text is None:
            raise PackageError(f'{self}: Could not download POM from any mirror')
        try:
            self._pom = PackagePOM(self, pom_text)
        except ET.ParseError as e:
            raise PackageError(f'{self}: POM is not valid XML: {e}') from e
        return self._pom

    @property
    def _urlquery(self) -> str:
        """Search query string (``g:…+AND+a:…[+AND+v:…]``) for this package."""
        q = f'g:{self.groupId}+AND+a:{self.artifactId}'
        if self.version is not None:
            q += f'+AND+v:{self.version}'
        return q

    async def _query_maven(self) -> None:
        """Look the package up in each mirror's ``maven-metadata.xml``.

        Without a version, the ``release`` (or else ``latest``) version from
        the metadata is adopted. With a version, it must be listed in the
        metadata. Sets ``_verified`` on success.

        Raises:
            TooManyRequestsException: a mirror answered with HTTP 429.
        """
        self._verified = False
        group_path = self.groupId.replace(".", "/")
        async with aiohttp.ClientSession() as session:
            for mirror in mirrors:
                url = f'{mirror}/{group_path}/{self.artifactId}/maven-metadata.xml'
                logger.debug(f'{self}: Querying maven at url {url}')
                async with session.get(url) as response:
                    if response.status == 429:
                        raise TooManyRequestsException()
                    if response.status != 200:
                        logger.info(
                            f'{self}: HTTP error {response.status} '
                            f'downloading maven metadata from {url}'
                        )
                        continue
                    response_text = await response.text()

                try:
                    metadata = ET.fromstring(response_text)
                except ET.ParseError:
                    # A broken mirror must not prevent trying the next one.
                    logger.warning(f'{self}: Invalid XML for maven metadata: {response_text}')
                    continue
                logger.debug(f'{self}: Metadata found')

                if self.version is None:
                    release_tag = metadata.find('./versioning/release')
                    latest_tag = metadata.find('./versioning/latest')
                    if release_tag is not None:
                        version = release_tag.text
                    elif latest_tag is not None:
                        version = latest_tag.text
                    else:
                        version = None
                    if version is not None:
                        logger.debug(f'{self}: Using newest version {version}')
                        self.version = version
                        self._verified = True
                        return
                    logger.info(
                        f'{self}: Could not find latest version in metadata from mirror {mirror}'
                    )
                else:
                    listed = metadata.find(
                        f'./versioning/versions/version[.="{self.version}"]'
                    )
                    if listed is not None:
                        logger.debug(f'{self}: Version {self.version} is valid')
                        self._verified = True
                        return
                    logger.info(
                        f'{self}: Could not find version {self.version} '
                        f'in metadata from mirror {mirror}'
                    )

        if self.implicit:
            logger.info(f'{self}: Package not found in any mirror')
        else:
            logger.warning(f'{self}: Package not found in any mirror')

    async def verify(self) -> bool:
        """Return True if the package exists on a mirror (queries if unknown)."""
        if not self._verified:
            await self._query_maven()
        return self._verified


def load_package_list(list_path: Path, queue: asyncio.Queue, *,
                      add_jvm_variants: bool = False) -> None:
    """Queue every package listed in ``list_path``.

    Each non-empty line must be ``groupID:artifactID`` or
    ``groupID:artifactID:version``; other lines are reported and skipped.

    Args:
        list_path: Text file with one package per line.
        queue: Queue the parsed packages are put on.
        add_jvm_variants: Also queue an implicit ``<artifactId>-jvm`` package
            for every artifact not already ending in ``-jvm``. Off by default,
            which matches the previous behaviour where this step never ran.
    """
    logger.info(f'Parsing {list_path}')
    with list_path.open('r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            sections = line.split(':')
            if len(sections) < 2 or len(sections) > 3:
                logger.warning(
                    f'Invalid package format "{line}". It should be '
                    '"groupID:artifactID" or "groupID:artifactID:version"'
                )
                continue
            package = Package(
                sections[0],
                sections[1],
                sections[2] if len(sections) == 3 else None,
            )
            queue.put_nowait(package)
            if add_jvm_variants and not package.artifactId.endswith('-jvm'):
                queue.put_nowait(
                    Package(
                        package.groupId,
                        f'{package.artifactId}-jvm',
                        package.version,
                        True,
                    )
                )


def create_gradle_build(packages, repo) -> str:
    """Return the text of a ``build.gradle.kts`` that resolves ``packages``.

    Every package is requested together with its ``:sources`` and ``:javadoc``
    classifiers from ``http://<repo>/releases``.
    """
    dep_lines = ',\n    '.join(f'"{dep}"' for dep in sorted(packages))
    return """// Generated, do not edit
plugins {
    kotlin("jvm") version "1.7.20"
}

repositories {
    maven {
        url=uri("http://""" + repo + """/releases")
        isAllowInsecureProtocol=true
    }
}

val deps = listOf(
    """ + dep_lines + """
).flatMap {
    listOf(it, it + ":sources", it + ":javadoc")
}.map {
    configurations.create(it.replace(':', '_')) {
        isCanBeResolved = true
        isCanBeConsumed = false
    } to it
}

dependencies {
    deps.forEach { (conf, dep) ->
        conf(dep)
    }
}

tasks.register("downloadDependencies") {
    val logger = getLogger()
    doLast {
        deps.forEach { (conf, dep) ->
            try {
                conf.files
            } catch (e: Exception) {
                if (dep.endsWith(":sources")) {
                    logger.warn("Package '$dep' has no sources")
                } else if (dep.endsWith(":javadoc")) {
                    logger.warn("Package '$dep' has no javadoc")
                } else {
                    logger.warn("Error while fetching '$dep': $e")
                }
            }
        }
    }
}
"""


def create_gradle_settings(repo: str) -> str:
    """Return the text of a ``settings.gradle.kts`` using ``repo`` for plugins."""
    return """// Generated, do not edit
rootProject.name = "gradle sync job"

pluginManagement {
    repositories {
        maven {
            url=uri("http://""" + repo + """/releases")
            isAllowInsecureProtocol=true
        }
    }
}
"""


async def _backoff(package: Package) -> None:
    """Log and sleep for a short random delay after an HTTP 429."""
    logger.info(f'{package}: Too many requests. Delaying next attempt')
    await asyncio.sleep(3*random.random() + 0.2)


async def download(package: Package, queue: asyncio.Queue) -> None:
    """Verify ``package``, parse its POM and record it for the Gradle build.

    Packages that are already done or in progress are skipped. If the POM's
    parent has not been processed yet, the parent and then the package itself
    are re-queued. A package whose POM cannot be obtained is still marked
    done so that packages waiting for it are not re-queued forever.

    Raises:
        SystemExit: a mirror kept answering HTTP 429 for ``MAX_ATTEMPTS`` tries.
    """
    async with done_lock:
        is_done = str(package) in done
    async with in_progress_lock:
        is_in_progress = str(package) in in_progress

    if is_done:
        logger.info(f'{package}: Already downloaded. Skipping.')
        return
    if is_in_progress:
        logger.info(f'{package}: Already in progress. Skipping.')
        return

    async with in_progress_lock:
        in_progress.add(str(package))

    for _ in range(MAX_ATTEMPTS):
        try:
            verified = await package.verify()
            break
        except TooManyRequestsException:
            await _backoff(package)
    else:
        logger.error(f'{package}: Verification failed after {MAX_ATTEMPTS} tries')
        raise SystemExit(1)

    if verified:
        pom = None
        for _ in range(MAX_ATTEMPTS):
            try:
                pom = await package.pom
                break
            except TooManyRequestsException:
                await _backoff(package)
            except WaitForPackage as e:
                logger.info(f'{package}: Waiting for {e.package}')
                async with in_progress_lock:
                    in_progress.discard(str(package))
                    if str(e.package) not in in_progress:
                        await queue.put(e.package)
                    await queue.put(package)
                return
            except PackageError as e:
                # Fall through and mark the package done without a POM.
                logger.warning(f'{package}: No pom ({e})')
                break
        else:
            logger.error(f'{package}: POM parsing failed after {MAX_ATTEMPTS} tries')
            raise SystemExit(1)

        if pom is not None:
            async with gradle_packages_lock:
                gradle_packages.update(pom.gradle_packages)
            if not pom.is_bom:
                for dep in pom.dependency_management:
                    logger.info(f'{package}: Handling transitive dependency {dep}')
                    await queue.put(dep)

    # The version may have been filled in during verification, so the package
    # is tracked under both its versioned and its unversioned coordinate.
    unversioned = copy.copy(package)
    unversioned.version = None

    async with done_lock:
        logger.debug(f'{package}: Marking done')
        done.add(str(package))
        done.add(str(unversioned))

    async with in_progress_lock:
        if str(package) in in_progress:
            in_progress.remove(str(package))
        elif str(unversioned) in in_progress:
            in_progress.remove(str(unversioned))
        else:
            logger.warning(f'{package}: Package is done, but not marked as in progress')


async def worker(queue: asyncio.Queue) -> None:
    """Process packages from ``queue`` until cancelled; errors are logged."""
    while True:
        package = await queue.get()
        try:
            await download(package, queue)
        except PackageError:
            logger.exception(f'{package}: Error while processing package')
        except Exception:
            # Boundary of the worker: one bad package must not kill the worker.
            logger.exception(f'{package}: Unknown error while processing package')
        finally:
            queue.task_done()


async def main(package_list: Path, output_dir: Path, num_workers: int, gradle_repo: str) -> None:
    """Resolve all packages in ``package_list`` and write the Gradle files.

    Args:
        package_list: File with one package coordinate per line.
        output_dir: Directory for ``build.gradle.kts``/``settings.gradle.kts``;
            created if missing.
        num_workers: Number of concurrent worker tasks.
        gradle_repo: Host (and path prefix) of the repository Gradle should use.
    """
    queue: asyncio.Queue = asyncio.Queue()
    load_package_list(package_list, queue)

    logger.debug(f'Starting {num_workers} workers')
    tasks = [asyncio.create_task(worker(queue)) for _ in range(num_workers)]

    await queue.join()
    logger.debug('Queue is empty. Cancelling workers')
    for task in tasks:
        task.cancel()
    await asyncio.gather(*tasks, return_exceptions=True)

    output_dir.mkdir(parents=True, exist_ok=True)
    async with gradle_packages_lock:
        logger.info('Generating build.gradle.kts')
        (output_dir / 'build.gradle.kts').write_text(
            create_gradle_build(gradle_packages, gradle_repo)
        )
        logger.info('Generating settings.gradle.kts')
        (output_dir / 'settings.gradle.kts').write_text(create_gradle_settings(gradle_repo))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--workers', type=int, default=20)
    parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
    parser.add_argument('--repo', type=str, help="The repository gradle should use",
                        required=True)
    parser.add_argument('--output_dir', type=Path,
                        help="The directory to put the generated gradle files in",
                        default=Path('.'), required=False)
    parser.add_argument('package_list', type=Path, help="The list of packages to download")
    args = parser.parse_args()

    if args.verbosity == 0:
        log_level = 'WARNING'
    elif args.verbosity == 1:
        log_level = 'INFO'
    else:
        log_level = 'DEBUG'
    logging.basicConfig(level=log_level)

    asyncio.run(
        main(args.package_list, args.output_dir, args.workers, args.repo)
    )