Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

342 Zeilen
10KB

  1. #!/bin/python3
  2. import re
  3. import random
  4. import argparse
  5. import logging
  6. import asyncio
  7. import subprocess
  8. import copy
  9. import aiohttp
  10. from pathlib import Path
  11. from xml.etree import ElementTree as ET
  12. ns = {'': 'http://maven.apache.org/POM/4.0.0'}
  13. ET.register_namespace('', ns[''])
  14. baseurl = 'https://search.maven.org'
  15. output_path: Path = Path()
  16. mirrors = [
  17. "https://repo.maven.apache.org/maven2",
  18. "https://repo1.maven.org/maven2",
  19. "https://oss.sonatype.org/content/repositories/snapshots",
  20. "https://packages.confluent.io/maven",
  21. "https://registry.quarkus.io/maven",
  22. "https://plugins.gradle.org/m2",
  23. ]
  24. done: set[str] = set()
  25. done_lock = asyncio.Lock()
  26. num_workers = 50
  27. class TooManyRequestsException(Exception):
  28. pass
  29. class PackagePOM:
  30. def __init__(self, package: 'Package', pom: str):
  31. logger.debug(f'{package}: Parsing POM')
  32. self._package = package
  33. self.raw_root = ET.fromstring(pom)
  34. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  35. self.packaging = packaging.text
  36. else:
  37. self.packaging = '??'
  38. self.is_bom = self.packaging == 'pom'
  39. if self.packaging == 'pom':
  40. self.packages = [package, *self.dependency_management]
  41. else:
  42. self.packages = [package]
  43. logger.debug(f'{package}: POM parsed')
  44. def get_property(self, prop: str):
  45. elem = self.raw_root.find(f'.//properties/{prop}', ns)
  46. if elem is not None:
  47. return elem.text
  48. else:
  49. return None
  50. def _package_from_xml_dep(self, dep: ET.Element) -> 'Package':
  51. def lookup_prop(match) -> str:
  52. prop = match.group(1)
  53. if prop == 'project.groupId':
  54. value = str(self._package.groupId)
  55. elif prop == 'project.artifactId':
  56. value = str(self._package.artifactId)
  57. elif prop == 'project.version':
  58. value = str(self._package.version)
  59. else:
  60. value = prop_replace(self.get_property(prop))
  61. logger.debug(f'{self._package}: Replacing property {prop} with {value}')
  62. return value
  63. def prop_replace(text) -> str:
  64. return re.sub(
  65. r'\$\{([^\}]*)\}',
  66. lookup_prop,
  67. text,
  68. )
  69. return Package(
  70. *[
  71. prop_replace(
  72. elem.text or '' if (elem := dep.find(tag, ns)) is not None else '',
  73. )
  74. for tag in [
  75. 'groupId',
  76. 'artifactId',
  77. 'version',
  78. ]
  79. ]
  80. )
  81. @property
  82. def dependency_management(self) -> list['Package']:
  83. dependencies: list[Package] = []
  84. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  85. package = self._package_from_xml_dep(dep)
  86. dependencies.append(package)
  87. return dependencies
  88. class Package:
  89. _pom: PackagePOM | None = None
  90. _verified: bool = False
  91. def __init__(self, groupId: str, artifactId: str, version: str | None = None, implicit: bool = False):
  92. self.groupId = groupId
  93. self.artifactId = artifactId
  94. self.version = version
  95. self.implicit = implicit
  96. def __str__(self) -> str:
  97. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  98. def __eq__(self, other) -> bool:
  99. return (
  100. self.groupId == other.groupId
  101. and self.artifactId == other.artifactId
  102. and self.version == other.version
  103. )
  104. def __hash__(self) -> int:
  105. return hash((self.groupId, self.artifactId, self.version))
  106. @property
  107. def dir_path(self):
  108. group_path = self.groupId.replace(".", "/")
  109. return f'{group_path}/{self.artifactId}/{self.version}'
  110. @property
  111. def base_filename(self):
  112. return f'{self.artifactId}-{self.version}'
  113. async def download_file(self, extension):
  114. filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
  115. async with aiohttp.ClientSession() as session:
  116. for mirror in mirrors:
  117. pom_url = f'{mirror}/{filepath}'
  118. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  119. async with session.get(pom_url) as response:
  120. if response.status == 200:
  121. logger.debug(f'{self}: {extension} downloaded')
  122. return await response.text()
  123. break
  124. elif response.status == 429:
  125. raise TooManyRequestsException()
  126. else:
  127. logger.error(f'{self}: HTTP error {response.status} from mirror {mirror}')
  128. else:
  129. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  130. return None
  131. @property
  132. async def pom(self) -> PackagePOM:
  133. if self._pom is not None:
  134. return self._pom
  135. if self.version is None:
  136. await self._query_maven()
  137. self._pom = PackagePOM(self, await self.download_file('pom'))
  138. return self._pom
  139. @property
  140. def _urlquery(self) -> str:
  141. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  142. if self.version is not None:
  143. q += f'+AND+v:{self.version}'
  144. return q
  145. async def _query_maven(self) -> None:
  146. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  147. logger.debug(f'{self}: Querying maven at url {url}')
  148. async with aiohttp.ClientSession() as session:
  149. async with session.get(url) as response:
  150. if response.status == 200:
  151. message = await response.json()
  152. num = message['response']['numFound']
  153. if num:
  154. logger.debug(f'{self}: Query successful')
  155. self._verified = True
  156. if self.version is None:
  157. version = message['response']['docs'][0]['latestVersion']
  158. self.version = version
  159. logger.debug(f'{self}: Using newest version {version}')
  160. else:
  161. if self.implicit:
  162. logger.debug(f'{self}: No matching packages found')
  163. else:
  164. logger.warning(f'{self}: No matching packages found')
  165. self._verified = False
  166. elif response.status == 429:
  167. raise TooManyRequestsException()
  168. else:
  169. self._verified = False
  170. logger.error(f'{self}: HTTP error {response.status} downloading pom')
  171. async def verify(self) -> bool:
  172. if not self._verified:
  173. await self._query_maven()
  174. return self._verified
  175. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  176. logger.info(f'Parsing {list_path}')
  177. with list_path.open('r') as f:
  178. for line in f.readlines():
  179. sections = line.strip().split(':')
  180. if len(sections) < 2 or len(sections) > 3:
  181. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  182. continue
  183. package = Package(
  184. sections[0],
  185. sections[1],
  186. sections[2] if len(sections) == 3 else None,
  187. )
  188. queue.put_nowait(package)
  189. if not package.artifactId.endswith('-jvm'):
  190. queue.put_nowait(
  191. Package(
  192. package.groupId,
  193. f'{package.artifactId}-jvm',
  194. package.version,
  195. True,
  196. )
  197. )
  198. async def download(package: Package, queue: asyncio.Queue) -> None:
  199. async with done_lock:
  200. skip = str(package) in done
  201. if skip:
  202. logger.info(f'{package}: Already downloaded. Skipping.')
  203. elif await package.verify():
  204. pom = await package.pom
  205. if pom:
  206. logger.info(f'{package}: Done')
  207. async with done_lock:
  208. for p in pom.packages:
  209. if not p.version:
  210. logger.warning(f'{p}: No version found!')
  211. logger.debug(f'{p}: Adding from BOM')
  212. done.add(str(p))
  213. else:
  214. logger.warning(f'{package}: No POM for package')
  215. async def worker(queue: asyncio.Queue) -> None:
  216. while True:
  217. package = await queue.get()
  218. while True:
  219. try:
  220. await download(package, queue)
  221. break
  222. except TooManyRequestsException:
  223. logger.debug('Too many requests. Delaying next attempt')
  224. await asyncio.sleep(3*random.random())
  225. queue.task_done()
  226. async def main() -> None:
  227. queue: asyncio.Queue = asyncio.Queue()
  228. tasks = []
  229. load_package_list(Path('package-list.txt'), queue)
  230. logger.debug(f'Starting {num_workers} workers')
  231. for i in range(num_workers):
  232. tasks.append(
  233. asyncio.create_task(
  234. worker(queue)
  235. )
  236. )
  237. await queue.join()
  238. logger.debug('Queue is empty. Cancelling workers')
  239. for task in tasks:
  240. task.cancel()
  241. await asyncio.gather(*tasks, return_exceptions=True)
  242. logger.info('Generating list of all packages')
  243. async with done_lock:
  244. with open(output_path, 'w') as f:
  245. for p in sorted(done):
  246. f.write(p + '\n')
  247. logger = logging.getLogger(__name__)
  248. if __name__ == '__main__':
  249. parser = argparse.ArgumentParser()
  250. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  251. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  252. parser.add_argument('-o', '--output', type=Path, default=Path('full-package-list.txt'))
  253. args = parser.parse_args()
  254. if args.verbosity == 0:
  255. log_level = 'WARNING'
  256. elif args.verbosity == 1:
  257. log_level = 'INFO'
  258. else:
  259. log_level = 'DEBUG'
  260. logging.basicConfig(level=log_level)
  261. num_workers = args.workers
  262. output_path = args.output
  263. asyncio.run(main())