Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

316 Zeilen
9.5KB

  1. #!/bin/python3
  2. import re
  3. import random
  4. import argparse
  5. import logging
  6. import asyncio
  7. import subprocess
  8. import copy
  9. import aiohttp
  10. from pathlib import Path
  11. from xml.etree import ElementTree as ET
  12. ns = {'': 'http://maven.apache.org/POM/4.0.0'}
  13. ET.register_namespace('', ns[''])
  14. baseurl = 'https://search.maven.org'
  15. output_path: Path = Path()
  16. mirrors = [
  17. "https://repo.maven.apache.org/maven2",
  18. "https://repo1.maven.org/maven2",
  19. "https://oss.sonatype.org/content/repositories/snapshots",
  20. "https://packages.confluent.io/maven",
  21. "https://registry.quarkus.io/maven",
  22. "https://plugins.gradle.org/m2",
  23. ]
  24. done: set[str] = set()
  25. done_lock = asyncio.Lock()
  26. num_workers = 50
  27. class PackagePOM:
  28. def __init__(self, package: 'Package', pom: str):
  29. logger.debug(f'{package}: Parsing POM')
  30. self._package = package
  31. self.raw_root = ET.fromstring(pom)
  32. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  33. self.packaging = packaging.text
  34. else:
  35. self.packaging = '??'
  36. self.is_bom = self.packaging == 'pom'
  37. if self.packaging == 'pom':
  38. self.packages = [package, *self.dependency_management]
  39. else:
  40. self.packages = [package]
  41. logger.debug(f'{package}: POM parsed')
  42. def get_property(self, prop: str):
  43. elem = self.raw_root.find(f'.//properties/{prop}', ns)
  44. if elem is not None:
  45. return elem.text
  46. else:
  47. return None
  48. def _package_from_xml_dep(self, dep: ET.Element) -> 'Package':
  49. def lookup_prop(match) -> str:
  50. prop = match.group(1)
  51. if prop == 'project.groupId':
  52. value = str(self._package.groupId)
  53. elif prop == 'project.artifactId':
  54. value = str(self._package.artifactId)
  55. elif prop == 'project.version':
  56. value = str(self._package.version)
  57. else:
  58. value = prop_replace(self.get_property(prop))
  59. logger.debug(f'{self._package}: Replacing property {prop} with {value}')
  60. return value
  61. def prop_replace(text) -> str:
  62. return re.sub(
  63. r'\$\{([^\}]*)\}',
  64. lookup_prop,
  65. text,
  66. )
  67. return Package(
  68. *[
  69. prop_replace(
  70. elem.text or '' if (elem := dep.find(tag, ns)) is not None else '',
  71. )
  72. for tag in [
  73. 'groupId',
  74. 'artifactId',
  75. 'version',
  76. ]
  77. ]
  78. )
  79. @property
  80. def dependency_management(self) -> list['Package']:
  81. dependencies: list[Package] = []
  82. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  83. package = self._package_from_xml_dep(dep)
  84. dependencies.append(package)
  85. return dependencies
  86. class Package:
  87. _pom: PackagePOM | None = None
  88. _verified: bool = False
  89. def __init__(self, groupId: str, artifactId: str, version: str | None = None):
  90. self.groupId = groupId
  91. self.artifactId = artifactId
  92. self.version = version
  93. def __str__(self) -> str:
  94. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  95. def __eq__(self, other) -> bool:
  96. return (
  97. self.groupId == other.groupId
  98. and self.artifactId == other.artifactId
  99. and self.version == other.version
  100. )
  101. def __hash__(self) -> int:
  102. return hash((self.groupId, self.artifactId, self.version))
  103. @property
  104. def dir_path(self):
  105. group_path = self.groupId.replace(".", "/")
  106. return f'{group_path}/{self.artifactId}/{self.version}'
  107. @property
  108. def base_filename(self):
  109. return f'{self.artifactId}-{self.version}'
  110. async def download_file(self, extension):
  111. filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
  112. async with aiohttp.ClientSession() as session:
  113. for mirror in mirrors:
  114. pom_url = f'{mirror}/{filepath}'
  115. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  116. async with session.get(pom_url) as response:
  117. if response.status == 200:
  118. logger.debug(f'{self}: {extension} downloaded')
  119. return await response.text()
  120. break
  121. elif response.status == 429:
  122. logger.error(f'{self}: HTTP error 429 (Too many requests). Retry after {response.headers["Retry-After"]}')
  123. else:
  124. logger.error(f'{self}: HTTP error {response.status} from mirror {mirror}')
  125. else:
  126. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  127. return None
  128. @property
  129. async def pom(self) -> PackagePOM:
  130. if self._pom is not None:
  131. return self._pom
  132. if self.version is None:
  133. await self._query_maven()
  134. self._pom = PackagePOM(self, await self.download_file('pom'))
  135. return self._pom
  136. @property
  137. def _urlquery(self) -> str:
  138. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  139. if self.version is not None:
  140. q += f'+AND+v:{self.version}'
  141. return q
  142. async def _query_maven(self) -> None:
  143. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  144. logger.debug(f'{self}: Querying maven at url {url}')
  145. async with aiohttp.ClientSession() as session:
  146. async with session.get(url) as response:
  147. if response.status == 200:
  148. message = await response.json()
  149. num = message['response']['numFound']
  150. if num:
  151. logger.debug(f'{self}: Query successful')
  152. self._verified = True
  153. if self.version is None:
  154. version = message['response']['docs'][0]['latestVersion']
  155. self.version = version
  156. logger.debug(f'{self}: Using newest version {version}')
  157. else:
  158. logger.warning(f'{self}: No matching packages found')
  159. self._verified = False
  160. else:
  161. self._verified = False
  162. logger.error(f'{self}: HTTP error {response.status} downloading pom')
  163. async def verify(self) -> bool:
  164. if not self._verified:
  165. await self._query_maven()
  166. return self._verified
  167. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  168. logger.info(f'Parsing {list_path}')
  169. with list_path.open('r') as f:
  170. for line in f.readlines():
  171. sections = line.strip().split(':')
  172. if len(sections) < 2 or len(sections) > 3:
  173. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  174. continue
  175. package = Package(
  176. sections[0],
  177. sections[1],
  178. sections[2] if len(sections) == 3 else None,
  179. )
  180. queue.put_nowait(package)
  181. async def download(package: Package, queue: asyncio.Queue) -> None:
  182. async with done_lock:
  183. skip = str(package) in done
  184. if skip:
  185. logger.info(f'{package}: Already downloaded. Skipping.')
  186. elif await package.verify():
  187. pom = await package.pom
  188. if pom:
  189. logger.info(f'{package}: Done')
  190. async with done_lock:
  191. for p in pom.packages:
  192. if not p.version:
  193. logger.warning(f'{p}: No version found!')
  194. logger.debug(f'{p}: Adding from BOM')
  195. done.add(str(p))
  196. else:
  197. logger.warning(f'{package}: No POM for package')
  198. else:
  199. logger.warning(f'{package}: Package not found. Check package name and internet connection')
  200. async def worker(queue: asyncio.Queue) -> None:
  201. while True:
  202. package = await queue.get()
  203. await download(package, queue)
  204. await asyncio.sleep(random.random())
  205. queue.task_done()
  206. async def main() -> None:
  207. queue: asyncio.Queue = asyncio.Queue()
  208. tasks = []
  209. load_package_list(Path('package-list.txt'), queue)
  210. logger.debug(f'Starting {num_workers} workers')
  211. for i in range(num_workers):
  212. tasks.append(
  213. asyncio.create_task(
  214. worker(queue)
  215. )
  216. )
  217. await queue.join()
  218. logger.debug('Queue is empty. Cancelling workers')
  219. for task in tasks:
  220. task.cancel()
  221. await asyncio.gather(*tasks, return_exceptions=True)
  222. logger.info('Generating list of all packages')
  223. async with done_lock:
  224. with open(output_path, 'w') as f:
  225. for p in done:
  226. f.write(p + '\n')
  227. logger = logging.getLogger(__name__)
  228. if __name__ == '__main__':
  229. parser = argparse.ArgumentParser()
  230. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  231. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  232. parser.add_argument('-o', '--output', type=Path, default=Path('full-package-list.txt'))
  233. args = parser.parse_args()
  234. if args.verbosity == 0:
  235. log_level = 'WARNING'
  236. elif args.verbosity == 1:
  237. log_level = 'INFO'
  238. else:
  239. log_level = 'DEBUG'
  240. logging.basicConfig(level=log_level)
  241. num_workers = args.workers
  242. output_path = args.output
  243. asyncio.run(main())