| @@ -1,6 +1,7 @@ | |||||
| #!/bin/python3 | #!/bin/python3 | ||||
| import re | import re | ||||
| import random | |||||
| import argparse | import argparse | ||||
| import logging | import logging | ||||
| import asyncio | import asyncio | ||||
| @@ -15,7 +16,7 @@ ns = {'': 'http://maven.apache.org/POM/4.0.0'} | |||||
| ET.register_namespace('', ns['']) | ET.register_namespace('', ns['']) | ||||
| baseurl = 'https://search.maven.org' | baseurl = 'https://search.maven.org' | ||||
| base_pom_path = Path('poms') | |||||
| output_path: Path = Path() | |||||
| mirrors = [ | mirrors = [ | ||||
| "https://repo.maven.apache.org/maven2", | "https://repo.maven.apache.org/maven2", | ||||
| "https://repo1.maven.org/maven2", | "https://repo1.maven.org/maven2", | ||||
| @@ -33,6 +34,7 @@ num_workers = 50 | |||||
| class PackagePOM: | class PackagePOM: | ||||
| def __init__(self, package: 'Package', pom: str): | def __init__(self, package: 'Package', pom: str): | ||||
| logger.debug(f'{package}: Parsing POM') | logger.debug(f'{package}: Parsing POM') | ||||
| self._package = package | |||||
| self.raw_root = ET.fromstring(pom) | self.raw_root = ET.fromstring(pom) | ||||
| if (packaging := self.raw_root.find('packaging', ns)) is not None: | if (packaging := self.raw_root.find('packaging', ns)) is not None: | ||||
| @@ -43,64 +45,12 @@ class PackagePOM: | |||||
| self.is_bom = self.packaging == 'pom' | self.is_bom = self.packaging == 'pom' | ||||
| if self.packaging == 'pom': | if self.packaging == 'pom': | ||||
| root_copy = copy.deepcopy(self.raw_root) | |||||
| depman = root_copy.find('dependencyManagement', ns) | |||||
| if depman is not None: | |||||
| root_copy.extend(depman.findall('*')) | |||||
| root_copy.remove(depman) | |||||
| if (groupId := root_copy.find('groupId', ns)) is not None: | |||||
| groupId.text = f'tmp.{package.groupId}' | |||||
| else: | |||||
| logger.warning(f"{package}: No groupId tag in pom") | |||||
| if (artifactId := root_copy.find('groupId', ns)) is not None: | |||||
| artifactId.text = f'placeholder.{package.artifactId}' | |||||
| else: | |||||
| logger.warning(f"{package}: No artifactId tag in pom") | |||||
| # Add a dependency for the pom itself | |||||
| if (dependencies := root_copy.find('dependencies', ns)) is not None: | |||||
| self_dep = ET.SubElement(dependencies, 'dependency') | |||||
| ET.SubElement(self_dep, 'groupId').text = package.groupId | |||||
| ET.SubElement(self_dep, 'artifactId').text = package.artifactId | |||||
| ET.SubElement(self_dep, 'version').text = package.version | |||||
| else: | |||||
| logger.warning(f"{package}: No dependencies tag in pom") | |||||
| self.generated_root = root_copy | |||||
| self.packages = [package, *self.dependency_management] | |||||
| else: | else: | ||||
| self.generated_root = ET.fromstring( | |||||
| f""" | |||||
| <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 | |||||
| https://maven.apache.org/xsd/maven-4.0.0.xsd" | |||||
| xmlns="http://maven.apache.org/POM/4.0.0" | |||||
| xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> | |||||
| <modelVersion>4.0.0</modelVersion> | |||||
| <groupId>tmp.{package.groupId}</groupId> | |||||
| <artifactId>placeholder-{package.artifactId}</artifactId> | |||||
| <version>{package.version}</version> | |||||
| <name>Package {package.artifactId}</name> | |||||
| <dependencies> | |||||
| <dependency> | |||||
| <groupId>{package.groupId}</groupId> | |||||
| <artifactId>{package.artifactId}</artifactId> | |||||
| <version>{package.version}</version> | |||||
| </dependency> | |||||
| </dependencies> | |||||
| </project> | |||||
| """ | |||||
| ) | |||||
| self.packages = [package] | |||||
| logger.debug(f'{package}: POM parsed') | logger.debug(f'{package}: POM parsed') | ||||
| def write(self, f): | |||||
| tree = ET.ElementTree(self.generated_root) | |||||
| ET.indent(tree) | |||||
| tree.write(f) | |||||
| def get_property(self, prop: str): | def get_property(self, prop: str): | ||||
| elem = self.raw_root.find(f'.//properties/{prop}', ns) | elem = self.raw_root.find(f'.//properties/{prop}', ns) | ||||
| if elem is not None: | if elem is not None: | ||||
| @@ -108,18 +58,33 @@ class PackagePOM: | |||||
| else: | else: | ||||
| return None | return None | ||||
| def _package_from_xml_dep(self, dep: ET.Element): | |||||
| def prop_replace(match): | |||||
| def _package_from_xml_dep(self, dep: ET.Element) -> 'Package': | |||||
| def lookup_prop(match) -> str: | |||||
| prop = match.group(1) | prop = match.group(1) | ||||
| value = self.get_property(match.group(1)) | |||||
| logger.debug(f'Replacing property {prop} with {value}') | |||||
| if prop == 'project.groupId': | |||||
| value = str(self._package.groupId) | |||||
| elif prop == 'project.artifactId': | |||||
| value = str(self._package.artifactId) | |||||
| elif prop == 'project.version': | |||||
| value = str(self._package.version) | |||||
| else: | |||||
| value = prop_replace(self.get_property(prop)) | |||||
| logger.debug(f'{self._package}: Replacing property {prop} with {value}') | |||||
| return value | return value | ||||
| def prop_replace(text) -> str: | |||||
| return re.sub( | |||||
| r'\$\{([^\}]*)\}', | |||||
| lookup_prop, | |||||
| text, | |||||
| ) | |||||
| return Package( | return Package( | ||||
| *[ | *[ | ||||
| re.sub( | |||||
| r'\$\{([^\}]*)\}', | |||||
| prop_replace, | |||||
| prop_replace( | |||||
| elem.text or '' if (elem := dep.find(tag, ns)) is not None else '', | elem.text or '' if (elem := dep.find(tag, ns)) is not None else '', | ||||
| ) | ) | ||||
| @@ -146,7 +111,7 @@ class Package: | |||||
| _pom: PackagePOM | None = None | _pom: PackagePOM | None = None | ||||
| _verified: bool = False | _verified: bool = False | ||||
| def __init__(self, groupId: str, artifactId: str, version: str = None): | |||||
| def __init__(self, groupId: str, artifactId: str, version: str | None = None): | |||||
| self.groupId = groupId | self.groupId = groupId | ||||
| self.artifactId = artifactId | self.artifactId = artifactId | ||||
| self.version = version | self.version = version | ||||
| @@ -186,8 +151,10 @@ class Package: | |||||
| logger.debug(f'{self}: {extension} downloaded') | logger.debug(f'{self}: {extension} downloaded') | ||||
| return await response.text() | return await response.text() | ||||
| break | break | ||||
| elif response.status == 429: | |||||
| logger.error(f'{self}: HTTP error 429 (Too many requests). Retry after {response.headers["Retry-After"]}') | |||||
| else: | else: | ||||
| logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}') | |||||
| logger.error(f'{self}: HTTP error {response.status} from mirror {mirror}') | |||||
| else: | else: | ||||
| logger.warning(f'{self}: File download of {extension} failed for all mirrors') | logger.warning(f'{self}: File download of {extension} failed for all mirrors') | ||||
| return None | return None | ||||
| @@ -228,14 +195,14 @@ class Package: | |||||
| self._verified = True | self._verified = True | ||||
| if self.version is None: | if self.version is None: | ||||
| version = message['response']['docs'][0]['latestVersion'] | version = message['response']['docs'][0]['latestVersion'] | ||||
| logger.debug(f'{self}: Using newest version {version}') | |||||
| self.version = version | self.version = version | ||||
| logger.debug(f'{self}: Using newest version {version}') | |||||
| else: | else: | ||||
| logger.warning(f'{self}: No matching packages found') | logger.warning(f'{self}: No matching packages found') | ||||
| self._verified = False | self._verified = False | ||||
| else: | else: | ||||
| self._verified = False | self._verified = False | ||||
| logger.warning(f'{self}: HTTP error {response.status} downloading pom') | |||||
| logger.error(f'{self}: HTTP error {response.status} downloading pom') | |||||
| async def verify(self) -> bool: | async def verify(self) -> bool: | ||||
| if not self._verified: | if not self._verified: | ||||
| @@ -270,26 +237,19 @@ async def download(package: Package, queue: asyncio.Queue) -> None: | |||||
| if skip: | if skip: | ||||
| logger.info(f'{package}: Already downloaded. Skipping.') | logger.info(f'{package}: Already downloaded. Skipping.') | ||||
| elif await package.verify(): | elif await package.verify(): | ||||
| async with done_lock: | |||||
| done.add(str(package)) | |||||
| pom_dir = base_pom_path / f'{package.groupId}-{package.artifactId}-{package.version}' | |||||
| pom_path = pom_dir / 'pom.xml' | |||||
| pom_dir.mkdir(exist_ok=True) | |||||
| pom = await package.pom | pom = await package.pom | ||||
| if not pom: | |||||
| return | |||||
| pom.write(pom_path) | |||||
| logger.info(f'{package}: Downloaded') | |||||
| if pom: | |||||
| logger.info(f'{package}: Done') | |||||
| async with done_lock: | |||||
| for p in pom.packages: | |||||
| if not p.version: | |||||
| logger.warning(f'{p}: No version found!') | |||||
| if not pom.is_bom: | |||||
| for dep in pom.dependency_management: | |||||
| logger.info(f'{package}: Handling transitive dependency {dep}') | |||||
| await queue.put(dep) | |||||
| logger.debug(f'{p}: Adding from BOM') | |||||
| done.add(str(p)) | |||||
| else: | |||||
| logger.warning(f'{package}: No POM for package') | |||||
| else: | else: | ||||
| logger.warning(f'{package}: Package not found. Check package name and internet connection') | logger.warning(f'{package}: Package not found. Check package name and internet connection') | ||||
| @@ -298,6 +258,7 @@ async def worker(queue: asyncio.Queue) -> None: | |||||
| while True: | while True: | ||||
| package = await queue.get() | package = await queue.get() | ||||
| await download(package, queue) | await download(package, queue) | ||||
| await asyncio.sleep(random.random()) | |||||
| queue.task_done() | queue.task_done() | ||||
| @@ -323,8 +284,11 @@ async def main() -> None: | |||||
| await asyncio.gather(*tasks, return_exceptions=True) | await asyncio.gather(*tasks, return_exceptions=True) | ||||
| logger.info('Generating master POM') | |||||
| subprocess.call(['sh', 'generate_master_pom.sh']) | |||||
| logger.info('Generating list of all packages') | |||||
| async with done_lock: | |||||
| with open(output_path, 'w') as f: | |||||
| for p in done: | |||||
| f.write(p + '\n') | |||||
| logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||
| @@ -333,6 +297,7 @@ if __name__ == '__main__': | |||||
| parser = argparse.ArgumentParser() | parser = argparse.ArgumentParser() | ||||
| parser.add_argument('-w', '--workers', type=int, default=num_workers) | parser.add_argument('-w', '--workers', type=int, default=num_workers) | ||||
| parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0) | parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0) | ||||
| parser.add_argument('-o', '--output', type=Path, default=Path('full-package-list.txt')) | |||||
| args = parser.parse_args() | args = parser.parse_args() | ||||
| if args.verbosity == 0: | if args.verbosity == 0: | ||||
| @@ -345,5 +310,6 @@ if __name__ == '__main__': | |||||
| logging.basicConfig(level=log_level) | logging.basicConfig(level=log_level) | ||||
| num_workers = args.workers | num_workers = args.workers | ||||
| output_path = args.output | |||||
| asyncio.run(main()) | asyncio.run(main()) | ||||