Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

392 lignes
13KB

  1. #!/bin/python3
  2. import re
  3. import random
  4. import argparse
  5. import logging
  6. import asyncio
  7. import subprocess
  8. import copy
  9. import aiohttp
  10. from pathlib import Path
  11. from xml.etree import ElementTree as ET
  12. ns = {'': 'http://maven.apache.org/POM/4.0.0'}
  13. ET.register_namespace('', ns[''])
  14. baseurl = 'https://search.maven.org'
  15. base_pom_path = Path('poms')
  16. mirrors = [
  17. "https://repo.maven.apache.org/maven2",
  18. "https://repo1.maven.org/maven2",
  19. "https://oss.sonatype.org/content/repositories/snapshots",
  20. "https://packages.confluent.io/maven",
  21. "https://registry.quarkus.io/maven",
  22. "https://plugins.gradle.org/m2",
  23. ]
  24. done: set[str] = set()
  25. done_lock = asyncio.Lock()
  26. num_workers = 50
  27. class TooManyRequestsException(Exception):
  28. pass
  29. class PackagePOM:
  30. def __init__(self, package: 'Package', pom: str):
  31. logger.debug(f'{package}: Parsing POM')
  32. self.raw_root = ET.fromstring(pom)
  33. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  34. self.packaging = packaging.text
  35. else:
  36. self.packaging = '??'
  37. self.is_bom = self.packaging == 'pom'
  38. if self.packaging == 'pom':
  39. root_copy = copy.deepcopy(self.raw_root)
  40. depman = root_copy.find('dependencyManagement', ns)
  41. if depman is not None:
  42. root_copy.extend(depman.findall('*'))
  43. root_copy.remove(depman)
  44. tmpGroupId = f'tmp.{package.groupId}'
  45. tmpArtifactId = f'placeholder.{package.artifactId}'
  46. tmpVersion = package.version
  47. if (groupId := root_copy.find('groupId', ns)) is not None:
  48. groupId.text = tmpGroupId
  49. else:
  50. logger.info(f"{package}: Inserting new groupId tag in pom")
  51. ET.SubElement(root_copy, 'groupId').text = tmpGroupId
  52. if (artifactId := root_copy.find('artifactId', ns)) is not None:
  53. artifactId.text = tmpArtifactId
  54. else:
  55. logger.info(f"{package}: Inserting new artifactId tag in pom")
  56. ET.SubElement(root_copy, 'artifactId').text = tmpArtifactId
  57. if (version := root_copy.find('version', ns)) is not None:
  58. version.text = tmpVersion
  59. else:
  60. logger.info(f"{package}: Inserting new version tag in pom")
  61. ET.SubElement(root_copy, 'version').text = tmpVersion
  62. # Add a dependency for the pom itself
  63. if (dependencies := root_copy.find('dependencies', ns)) is not None:
  64. self_dep = ET.SubElement(dependencies, 'dependency')
  65. ET.SubElement(self_dep, 'groupId').text = package.groupId
  66. ET.SubElement(self_dep, 'artifactId').text = package.artifactId
  67. ET.SubElement(self_dep, 'version').text = package.version
  68. else:
  69. logger.warning(f"{package}: No dependencies tag in pom")
  70. self.generated_root = root_copy
  71. else:
  72. self.generated_root = ET.fromstring(
  73. f"""
  74. <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
  75. https://maven.apache.org/xsd/maven-4.0.0.xsd"
  76. xmlns="http://maven.apache.org/POM/4.0.0"
  77. xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  78. <modelVersion>4.0.0</modelVersion>
  79. <groupId>tmp.{package.groupId}</groupId>
  80. <artifactId>placeholder-{package.artifactId}</artifactId>
  81. <version>{package.version}</version>
  82. <name>Package {package.artifactId}</name>
  83. <dependencies>
  84. <dependency>
  85. <groupId>{package.groupId}</groupId>
  86. <artifactId>{package.artifactId}</artifactId>
  87. <version>{package.version}</version>
  88. </dependency>
  89. </dependencies>
  90. </project>
  91. """
  92. )
  93. logger.debug(f'{package}: POM parsed')
  94. def write(self, f):
  95. tree = ET.ElementTree(self.generated_root)
  96. ET.indent(tree)
  97. tree.write(f)
  98. def get_property(self, prop: str):
  99. elem = self.raw_root.find(f'.//properties/{prop}', ns)
  100. if elem is not None:
  101. return elem.text
  102. else:
  103. return None
  104. def _package_from_xml_dep(self, dep: ET.Element):
  105. def prop_replace(match):
  106. prop = match.group(1)
  107. value = self.get_property(match.group(1))
  108. logger.debug(f'Replacing property {prop} with {value}')
  109. return value
  110. return Package(
  111. *[
  112. re.sub(
  113. r'\$\{([^\}]*)\}',
  114. prop_replace,
  115. elem.text or '' if (elem := dep.find(tag, ns)) is not None else '',
  116. )
  117. for tag in [
  118. 'groupId',
  119. 'artifactId',
  120. 'version',
  121. ]
  122. ]
  123. )
  124. @property
  125. def dependency_management(self) -> list['Package']:
  126. dependencies: list[Package] = []
  127. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  128. package = self._package_from_xml_dep(dep)
  129. dependencies.append(package)
  130. return dependencies
  131. class Package:
  132. _pom: PackagePOM | None = None
  133. _verified: bool = False
  134. def __init__(self, groupId: str, artifactId: str, version: str | None = None, implicit: bool = False):
  135. self.groupId = groupId
  136. self.artifactId = artifactId
  137. self.version = version
  138. self.implicit = implicit
  139. def __str__(self) -> str:
  140. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  141. def __eq__(self, other) -> bool:
  142. return (
  143. self.groupId == other.groupId
  144. and self.artifactId == other.artifactId
  145. and self.version == other.version
  146. )
  147. def __hash__(self) -> int:
  148. return hash((self.groupId, self.artifactId, self.version))
  149. @property
  150. def dir_path(self):
  151. group_path = self.groupId.replace(".", "/")
  152. return f'{group_path}/{self.artifactId}/{self.version}'
  153. @property
  154. def base_filename(self):
  155. return f'{self.artifactId}-{self.version}'
  156. async def download_file(self, extension):
  157. filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
  158. async with aiohttp.ClientSession() as session:
  159. for mirror in mirrors:
  160. pom_url = f'{mirror}/{filepath}'
  161. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  162. async with session.get(pom_url) as response:
  163. if response.status == 200:
  164. logger.debug(f'{self}: {extension} downloaded')
  165. return await response.text()
  166. break
  167. elif response.status == 429:
  168. raise TooManyRequestsException()
  169. else:
  170. logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}')
  171. else:
  172. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  173. return None
  174. @property
  175. async def pom(self) -> PackagePOM:
  176. if self._pom is not None:
  177. return self._pom
  178. if self.version is None:
  179. await self._query_maven()
  180. self._pom = PackagePOM(self, await self.download_file('pom'))
  181. return self._pom
  182. @property
  183. def _urlquery(self) -> str:
  184. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  185. if self.version is not None:
  186. q += f'+AND+v:{self.version}'
  187. return q
  188. async def _query_maven(self) -> None:
  189. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  190. logger.debug(f'{self}: Querying maven at url {url}')
  191. async with aiohttp.ClientSession() as session:
  192. async with session.get(url) as response:
  193. if response.status == 200:
  194. message = await response.json()
  195. num = message['response']['numFound']
  196. if num:
  197. logger.debug(f'{self}: Query successful')
  198. self._verified = True
  199. if self.version is None:
  200. version = message['response']['docs'][0]['latestVersion']
  201. logger.debug(f'{self}: Using newest version {version}')
  202. self.version = version
  203. else:
  204. if self.implicit:
  205. logger.debug(f'{self}: No matching packages found')
  206. else:
  207. logger.warning(f'{self}: No matching packages found')
  208. self._verified = False
  209. elif response.status == 429:
  210. raise TooManyRequestsException()
  211. else:
  212. self._verified = False
  213. logger.warning(f'{self}: HTTP error {response.status} downloading pom')
  214. async def verify(self) -> bool:
  215. if not self._verified:
  216. await self._query_maven()
  217. return self._verified
  218. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  219. logger.info(f'Parsing {list_path}')
  220. with list_path.open('r') as f:
  221. for line in f.readlines():
  222. sections = line.strip().split(':')
  223. if len(sections) < 2 or len(sections) > 3:
  224. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  225. continue
  226. package = Package(
  227. sections[0],
  228. sections[1],
  229. sections[2] if len(sections) == 3 else None,
  230. )
  231. queue.put_nowait(package)
  232. if not package.artifactId.endswith('-jvm'):
  233. queue.put_nowait(
  234. Package(
  235. package.groupId,
  236. f'{package.artifactId}-jvm',
  237. package.version,
  238. True,
  239. )
  240. )
  241. async def download(package: Package, queue: asyncio.Queue) -> None:
  242. async with done_lock:
  243. skip = str(package) in done
  244. if skip:
  245. logger.info(f'{package}: Already downloaded. Skipping.')
  246. elif await package.verify():
  247. async with done_lock:
  248. done.add(str(package))
  249. pom_dir = base_pom_path / f'{package.groupId}-{package.artifactId}-{package.version}'
  250. pom_path = pom_dir / 'pom.xml'
  251. pom_dir.mkdir(exist_ok=True)
  252. pom = await package.pom
  253. if not pom:
  254. return
  255. pom.write(pom_path)
  256. logger.info(f'{package}: Downloaded')
  257. if not pom.is_bom:
  258. for dep in pom.dependency_management:
  259. logger.info(f'{package}: Handling transitive dependency {dep}')
  260. await queue.put(dep)
  261. async def worker(queue: asyncio.Queue) -> None:
  262. while True:
  263. package = await queue.get()
  264. while True:
  265. try:
  266. await download(package, queue)
  267. break
  268. except TooManyRequestsException:
  269. logger.info('Too many requests. Delaying next attempt')
  270. await asyncio.sleep(3*random.random() + 0.2)
  271. queue.task_done()
  272. async def main() -> None:
  273. queue: asyncio.Queue = asyncio.Queue()
  274. tasks = []
  275. load_package_list(Path('package-list.txt'), queue)
  276. logger.debug(f'Starting {num_workers} workers')
  277. for i in range(num_workers):
  278. tasks.append(
  279. asyncio.create_task(
  280. worker(queue)
  281. )
  282. )
  283. await queue.join()
  284. logger.debug('Queue is empty. Cancelling workers')
  285. for task in tasks:
  286. task.cancel()
  287. await asyncio.gather(*tasks, return_exceptions=True)
  288. logger.info('Generating master POM')
  289. subprocess.call(['sh', 'generate_master_pom.sh'])
  290. logger = logging.getLogger(__name__)
  291. if __name__ == '__main__':
  292. parser = argparse.ArgumentParser()
  293. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  294. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  295. args = parser.parse_args()
  296. if args.verbosity == 0:
  297. log_level = 'WARNING'
  298. elif args.verbosity == 1:
  299. log_level = 'INFO'
  300. else:
  301. log_level = 'DEBUG'
  302. logging.basicConfig(level=log_level)
  303. num_workers = args.workers
  304. asyncio.run(main())