| @@ -1,6 +1,7 @@ | |||
| #!/bin/python3 | |||
| import re | |||
| import random | |||
| import argparse | |||
| import logging | |||
| import asyncio | |||
| @@ -15,7 +16,7 @@ ns = {'': 'http://maven.apache.org/POM/4.0.0'} | |||
| ET.register_namespace('', ns['']) | |||
| baseurl = 'https://search.maven.org' | |||
| base_pom_path = Path('poms') | |||
| output_path: Path = Path() | |||
| mirrors = [ | |||
| "https://repo.maven.apache.org/maven2", | |||
| "https://repo1.maven.org/maven2", | |||
| @@ -33,6 +34,7 @@ num_workers = 50 | |||
| class PackagePOM: | |||
| def __init__(self, package: 'Package', pom: str): | |||
| logger.debug(f'{package}: Parsing POM') | |||
| self._package = package | |||
| self.raw_root = ET.fromstring(pom) | |||
| if (packaging := self.raw_root.find('packaging', ns)) is not None: | |||
| @@ -43,64 +45,12 @@ class PackagePOM: | |||
| self.is_bom = self.packaging == 'pom' | |||
| if self.packaging == 'pom': | |||
| root_copy = copy.deepcopy(self.raw_root) | |||
| depman = root_copy.find('dependencyManagement', ns) | |||
| if depman is not None: | |||
| root_copy.extend(depman.findall('*')) | |||
| root_copy.remove(depman) | |||
| if (groupId := root_copy.find('groupId', ns)) is not None: | |||
| groupId.text = f'tmp.{package.groupId}' | |||
| else: | |||
| logger.warning(f"{package}: No groupId tag in pom") | |||
| if (artifactId := root_copy.find('groupId', ns)) is not None: | |||
| artifactId.text = f'placeholder.{package.artifactId}' | |||
| else: | |||
| logger.warning(f"{package}: No artifactId tag in pom") | |||
| # Add a dependency for the pom itself | |||
| if (dependencies := root_copy.find('dependencies', ns)) is not None: | |||
| self_dep = ET.SubElement(dependencies, 'dependency') | |||
| ET.SubElement(self_dep, 'groupId').text = package.groupId | |||
| ET.SubElement(self_dep, 'artifactId').text = package.artifactId | |||
| ET.SubElement(self_dep, 'version').text = package.version | |||
| else: | |||
| logger.warning(f"{package}: No dependencies tag in pom") | |||
| self.generated_root = root_copy | |||
| self.packages = [package, *self.dependency_management] | |||
| else: | |||
| self.generated_root = ET.fromstring( | |||
| f""" | |||
| <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 | |||
| https://maven.apache.org/xsd/maven-4.0.0.xsd" | |||
| xmlns="http://maven.apache.org/POM/4.0.0" | |||
| xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> | |||
| <modelVersion>4.0.0</modelVersion> | |||
| <groupId>tmp.{package.groupId}</groupId> | |||
| <artifactId>placeholder-{package.artifactId}</artifactId> | |||
| <version>{package.version}</version> | |||
| <name>Package {package.artifactId}</name> | |||
| <dependencies> | |||
| <dependency> | |||
| <groupId>{package.groupId}</groupId> | |||
| <artifactId>{package.artifactId}</artifactId> | |||
| <version>{package.version}</version> | |||
| </dependency> | |||
| </dependencies> | |||
| </project> | |||
| """ | |||
| ) | |||
| self.packages = [package] | |||
| logger.debug(f'{package}: POM parsed') | |||
def write(self, f):
    """Serialise the generated POM to *f* with readable indentation.

    *f* is passed straight to ``ElementTree.write``.
    """
    document = ET.ElementTree(self.generated_root)
    # Indent in place so the file on disk is readable.
    ET.indent(document)
    document.write(f)
| def get_property(self, prop: str): | |||
| elem = self.raw_root.find(f'.//properties/{prop}', ns) | |||
| if elem is not None: | |||
| @@ -108,18 +58,33 @@ class PackagePOM: | |||
| else: | |||
| return None | |||
| def _package_from_xml_dep(self, dep: ET.Element): | |||
| def prop_replace(match): | |||
| def _package_from_xml_dep(self, dep: ET.Element) -> 'Package': | |||
def lookup_prop(match) -> str:
    """Resolve one ``${...}`` placeholder matched by ``prop_replace``.

    ``project.groupId``, ``project.artifactId`` and ``project.version`` are
    answered from the package that owns this POM. Any other name is looked
    up with ``self.get_property`` and its value is expanded again, because a
    property value may itself contain placeholders.

    A property that cannot be resolved is left in the text as the literal
    placeholder. Passing ``None`` on to ``re.sub`` would raise ``TypeError``.

    :param match: regex match whose group 1 is the property name.
    :return: the replacement text for the whole placeholder.
    """
    prop = match.group(1)
    if prop == 'project.groupId':
        value = str(self._package.groupId)
    elif prop == 'project.artifactId':
        value = str(self._package.artifactId)
    elif prop == 'project.version':
        value = str(self._package.version)
    else:
        raw_value = self.get_property(prop)
        if raw_value is None:
            # Unknown property: keep "${prop}" verbatim rather than crash.
            logger.warning(f'{self._package}: Property {prop} is not defined in pom')
            return match.group(0)
        # NOTE(review): a self-referential property would recurse without
        # bound here, since prop_replace calls back into lookup_prop.
        value = prop_replace(raw_value)
    logger.debug(f'{self._package}: Replacing property {prop} with {value}')
    return value
def prop_replace(text) -> str:
    """Return *text* with every ``${name}`` placeholder expanded.

    Each placeholder is handed to ``lookup_prop``, which supplies the
    replacement string.
    """
    placeholder_pattern = r'\$\{([^\}]*)\}'
    expanded = re.sub(placeholder_pattern, lookup_prop, text)
    return expanded
| return Package( | |||
| *[ | |||
| re.sub( | |||
| r'\$\{([^\}]*)\}', | |||
| prop_replace, | |||
| prop_replace( | |||
| elem.text or '' if (elem := dep.find(tag, ns)) is not None else '', | |||
| ) | |||
| @@ -146,7 +111,7 @@ class Package: | |||
| _pom: PackagePOM | None = None | |||
| _verified: bool = False | |||
| def __init__(self, groupId: str, artifactId: str, version: str = None): | |||
| def __init__(self, groupId: str, artifactId: str, version: str | None = None): | |||
| self.groupId = groupId | |||
| self.artifactId = artifactId | |||
| self.version = version | |||
| @@ -186,8 +151,10 @@ class Package: | |||
| logger.debug(f'{self}: {extension} downloaded') | |||
| return await response.text() | |||
| break | |||
| elif response.status == 429: | |||
| logger.error(f'{self}: HTTP error 429 (Too many requests). Retry after {response.headers["Retry-After"]}') | |||
| else: | |||
| logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}') | |||
| logger.error(f'{self}: HTTP error {response.status} from mirror {mirror}') | |||
| else: | |||
| logger.warning(f'{self}: File download of {extension} failed for all mirrors') | |||
| return None | |||
| @@ -228,14 +195,14 @@ class Package: | |||
| self._verified = True | |||
| if self.version is None: | |||
| version = message['response']['docs'][0]['latestVersion'] | |||
| logger.debug(f'{self}: Using newest version {version}') | |||
| self.version = version | |||
| logger.debug(f'{self}: Using newest version {version}') | |||
| else: | |||
| logger.warning(f'{self}: No matching packages found') | |||
| self._verified = False | |||
| else: | |||
| self._verified = False | |||
| logger.warning(f'{self}: HTTP error {response.status} downloading pom') | |||
| logger.error(f'{self}: HTTP error {response.status} downloading pom') | |||
| async def verify(self) -> bool: | |||
| if not self._verified: | |||
| @@ -270,26 +237,19 @@ async def download(package: Package, queue: asyncio.Queue) -> None: | |||
| if skip: | |||
| logger.info(f'{package}: Already downloaded. Skipping.') | |||
| elif await package.verify(): | |||
| async with done_lock: | |||
| done.add(str(package)) | |||
| pom_dir = base_pom_path / f'{package.groupId}-{package.artifactId}-{package.version}' | |||
| pom_path = pom_dir / 'pom.xml' | |||
| pom_dir.mkdir(exist_ok=True) | |||
| pom = await package.pom | |||
| if not pom: | |||
| return | |||
| pom.write(pom_path) | |||
| logger.info(f'{package}: Downloaded') | |||
| if pom: | |||
| logger.info(f'{package}: Done') | |||
| async with done_lock: | |||
| for p in pom.packages: | |||
| if not p.version: | |||
| logger.warning(f'{p}: No version found!') | |||
| if not pom.is_bom: | |||
| for dep in pom.dependency_management: | |||
| logger.info(f'{package}: Handling transitive dependency {dep}') | |||
| await queue.put(dep) | |||
| logger.debug(f'{p}: Adding from BOM') | |||
| done.add(str(p)) | |||
| else: | |||
| logger.warning(f'{package}: No POM for package') | |||
| else: | |||
| logger.warning(f'{package}: Package not found. Check package name and internet connection') | |||
| @@ -298,6 +258,7 @@ async def worker(queue: asyncio.Queue) -> None: | |||
| while True: | |||
| package = await queue.get() | |||
| await download(package, queue) | |||
| await asyncio.sleep(random.random()) | |||
| queue.task_done() | |||
| @@ -323,8 +284,11 @@ async def main() -> None: | |||
| await asyncio.gather(*tasks, return_exceptions=True) | |||
| logger.info('Generating master POM') | |||
| subprocess.call(['sh', 'generate_master_pom.sh']) | |||
| logger.info('Generating list of all packages') | |||
| async with done_lock: | |||
| with open(output_path, 'w') as f: | |||
| for p in done: | |||
| f.write(p + '\n') | |||
| logger = logging.getLogger(__name__) | |||
| @@ -333,6 +297,7 @@ if __name__ == '__main__': | |||
| parser = argparse.ArgumentParser() | |||
| parser.add_argument('-w', '--workers', type=int, default=num_workers) | |||
| parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0) | |||
| parser.add_argument('-o', '--output', type=Path, default=Path('full-package-list.txt')) | |||
| args = parser.parse_args() | |||
| if args.verbosity == 0: | |||
| @@ -345,5 +310,6 @@ if __name__ == '__main__': | |||
| logging.basicConfig(level=log_level) | |||
| num_workers = args.workers | |||
| output_path = args.output | |||
| asyncio.run(main()) | |||