diff --git a/update-poms.py b/update-poms.py index 59edb4b..bcf2a07 100755 --- a/update-poms.py +++ b/update-poms.py @@ -1,6 +1,7 @@ #!/bin/python3 import re +import random import argparse import logging import asyncio @@ -15,7 +16,7 @@ ns = {'': 'http://maven.apache.org/POM/4.0.0'} ET.register_namespace('', ns['']) baseurl = 'https://search.maven.org' -base_pom_path = Path('poms') +output_path: Path = Path() mirrors = [ "https://repo.maven.apache.org/maven2", "https://repo1.maven.org/maven2", @@ -33,6 +34,7 @@ num_workers = 50 class PackagePOM: def __init__(self, package: 'Package', pom: str): logger.debug(f'{package}: Parsing POM') + self._package = package self.raw_root = ET.fromstring(pom) if (packaging := self.raw_root.find('packaging', ns)) is not None: @@ -43,64 +45,12 @@ class PackagePOM: self.is_bom = self.packaging == 'pom' if self.packaging == 'pom': - root_copy = copy.deepcopy(self.raw_root) - depman = root_copy.find('dependencyManagement', ns) - if depman is not None: - root_copy.extend(depman.findall('*')) - root_copy.remove(depman) - - if (groupId := root_copy.find('groupId', ns)) is not None: - groupId.text = f'tmp.{package.groupId}' - else: - logger.warning(f"{package}: No groupId tag in pom") - - if (artifactId := root_copy.find('groupId', ns)) is not None: - artifactId.text = f'placeholder.{package.artifactId}' - else: - logger.warning(f"{package}: No artifactId tag in pom") - - # Add a dependency for the pom itself - if (dependencies := root_copy.find('dependencies', ns)) is not None: - self_dep = ET.SubElement(dependencies, 'dependency') - ET.SubElement(self_dep, 'groupId').text = package.groupId - ET.SubElement(self_dep, 'artifactId').text = package.artifactId - ET.SubElement(self_dep, 'version').text = package.version - else: - logger.warning(f"{package}: No dependencies tag in pom") - - self.generated_root = root_copy + self.packages = [package, *self.dependency_management] else: - self.generated_root = ET.fromstring( - f""" - - - 4.0.0 - tmp.{package.groupId} - placeholder-{package.artifactId} - {package.version} - Package {package.artifactId} - - - - {package.groupId} - {package.artifactId} - {package.version} - - - - """ - ) + self.packages = [package] logger.debug(f'{package}: POM parsed') - def write(self, f): - tree = ET.ElementTree(self.generated_root) - ET.indent(tree) - tree.write(f) - def get_property(self, prop: str): elem = self.raw_root.find(f'.//properties/{prop}', ns) if elem is not None: @@ -108,18 +58,33 @@ class PackagePOM: else: return None - def _package_from_xml_dep(self, dep: ET.Element): - def prop_replace(match): + def _package_from_xml_dep(self, dep: ET.Element) -> 'Package': + def lookup_prop(match) -> str: prop = match.group(1) - value = self.get_property(match.group(1)) - logger.debug(f'Replacing property {prop} with {value}') + + if prop == 'project.groupId': + value = str(self._package.groupId) + elif prop == 'project.artifactId': + value = str(self._package.artifactId) + elif prop == 'project.version': + value = str(self._package.version) + else: + value = prop_replace(self.get_property(prop)) + + logger.debug(f'{self._package}: Replacing property {prop} with {value}') return value + def prop_replace(text) -> str: + return re.sub( + r'\$\{([^\}]*)\}', + lookup_prop, + text, + ) + + return Package( *[ - re.sub( - r'\$\{([^\}]*)\}', - prop_replace, + prop_replace( elem.text or '' if (elem := dep.find(tag, ns)) is not None else '', ) @@ -146,7 +111,7 @@ class Package: _pom: PackagePOM | None = None _verified: bool = False - def __init__(self, groupId: str, artifactId: str, version: str = None): + def __init__(self, groupId: str, artifactId: str, version: str | None = None): self.groupId = groupId self.artifactId = artifactId self.version = version @@ -186,8 +151,10 @@ class Package: logger.debug(f'{self}: {extension} downloaded') return await response.text() break + elif response.status == 429: + logger.error(f'{self}: HTTP error 429 (Too many requests). Retry after {response.headers["Retry-After"]}') else: - logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}') + logger.error(f'{self}: HTTP error {response.status} from mirror {mirror}') else: logger.warning(f'{self}: File download of {extension} failed for all mirrors') return None @@ -228,14 +195,14 @@ class Package: self._verified = True if self.version is None: version = message['response']['docs'][0]['latestVersion'] - logger.debug(f'{self}: Using newest version {version}') self.version = version + logger.debug(f'{self}: Using newest version {version}') else: logger.warning(f'{self}: No matching packages found') self._verified = False else: self._verified = False - logger.warning(f'{self}: HTTP error {response.status} downloading pom') + logger.error(f'{self}: HTTP error {response.status} downloading pom') async def verify(self) -> bool: if not self._verified: @@ -270,26 +237,19 @@ async def download(package: Package, queue: asyncio.Queue) -> None: if skip: logger.info(f'{package}: Already downloaded. Skipping.') elif await package.verify(): - async with done_lock: - done.add(str(package)) - - pom_dir = base_pom_path / f'{package.groupId}-{package.artifactId}-{package.version}' - pom_path = pom_dir / 'pom.xml' - - pom_dir.mkdir(exist_ok=True) - pom = await package.pom - if not pom: - return - - pom.write(pom_path) - logger.info(f'{package}: Downloaded') + if pom: + logger.info(f'{package}: Done') + async with done_lock: + for p in pom.packages: + if not p.version: + logger.warning(f'{p}: No version found!') - if not pom.is_bom: - for dep in pom.dependency_management: - logger.info(f'{package}: Handling transitive dependency {dep}') - await queue.put(dep) + logger.debug(f'{p}: Adding from BOM') + done.add(str(p)) + else: + logger.warning(f'{package}: No POM for package') else: logger.warning(f'{package}: Package not found. Check package name and internet connection') @@ -298,6 +258,7 @@ async def worker(queue: asyncio.Queue) -> None: while True: package = await queue.get() await download(package, queue) + await asyncio.sleep(random.random()) queue.task_done() @@ -323,8 +284,11 @@ async def main() -> None: await asyncio.gather(*tasks, return_exceptions=True) - logger.info('Generating master POM') - subprocess.call(['sh', 'generate_master_pom.sh']) + logger.info('Generating list of all packages') + async with done_lock: + with open(output_path, 'w') as f: + for p in done: + f.write(p + '\n') logger = logging.getLogger(__name__) @@ -333,6 +297,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-w', '--workers', type=int, default=num_workers) parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0) + parser.add_argument('-o', '--output', type=Path, default=Path('full-package-list.txt')) args = parser.parse_args() if args.verbosity == 0: @@ -345,5 +310,6 @@ if __name__ == '__main__': logging.basicConfig(level=log_level) num_workers = args.workers + output_path = args.output asyncio.run(main())