Ви не можете вибрати більше 25 тем. Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

375 рядки
12KB

  1. #!/bin/python3
  2. import re
  3. import argparse
  4. import logging
  5. import asyncio
  6. import subprocess
  7. import copy
  8. import aiohttp
  9. from pathlib import Path
  10. from xml.etree import ElementTree as ET
  11. ns = {'': 'http://maven.apache.org/POM/4.0.0'}
  12. ET.register_namespace('', ns[''])
  13. baseurl = 'https://search.maven.org'
  14. base_pom_path = Path('poms')
  15. mirrors = [
  16. "https://repo.maven.apache.org/maven2",
  17. "https://repo1.maven.org/maven2",
  18. "https://oss.sonatype.org/content/repositories/snapshots",
  19. "https://packages.confluent.io/maven",
  20. "https://registry.quarkus.io/maven",
  21. "https://plugins.gradle.org/m2",
  22. ]
  23. done: set[str] = set()
  24. done_lock = asyncio.Lock()
  25. num_workers = 50
  26. class PackagePOM:
  27. def __init__(self, package: 'Package', pom: str):
  28. logger.debug(f'{package}: Parsing POM')
  29. self.raw_root = ET.fromstring(pom)
  30. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  31. self.packaging = packaging.text
  32. else:
  33. self.packaging = '??'
  34. self.is_bom = self.packaging == 'pom'
  35. if self.packaging == 'pom':
  36. root_copy = copy.deepcopy(self.raw_root)
  37. depman = root_copy.find('dependencyManagement', ns)
  38. if depman is not None:
  39. root_copy.extend(depman.findall('*'))
  40. root_copy.remove(depman)
  41. tmpGroupId = f'tmp.{package.groupId}'
  42. tmpArtifactId = f'placeholder.{package.artifactId}'
  43. tmpVersion = package.version
  44. if (groupId := root_copy.find('groupId', ns)) is not None:
  45. groupId.text = tmpGroupId
  46. else:
  47. logger.info(f"{package}: Inserting new groupId tag in pom")
  48. ET.SubElement(root_copy, 'groupId').text = tmpGroupId
  49. if (artifactId := root_copy.find('artifactId', ns)) is not None:
  50. artifactId.text = tmpArtifactId
  51. else:
  52. logger.info(f"{package}: Inserting new artifactId tag in pom")
  53. ET.SubElement(root_copy, 'artifactId').text = tmpArtifactId
  54. if (version := root_copy.find('version', ns)) is not None:
  55. version.text = tmpVersion
  56. else:
  57. logger.info(f"{package}: Inserting new version tag in pom")
  58. ET.SubElement(root_copy, 'version').text = tmpVersion
  59. # Add a dependency for the pom itself
  60. if (dependencies := root_copy.find('dependencies', ns)) is not None:
  61. self_dep = ET.SubElement(dependencies, 'dependency')
  62. ET.SubElement(self_dep, 'groupId').text = package.groupId
  63. ET.SubElement(self_dep, 'artifactId').text = package.artifactId
  64. ET.SubElement(self_dep, 'version').text = package.version
  65. else:
  66. logger.warning(f"{package}: No dependencies tag in pom")
  67. self.generated_root = root_copy
  68. else:
  69. self.generated_root = ET.fromstring(
  70. f"""
  71. <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
  72. https://maven.apache.org/xsd/maven-4.0.0.xsd"
  73. xmlns="http://maven.apache.org/POM/4.0.0"
  74. xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  75. <modelVersion>4.0.0</modelVersion>
  76. <groupId>tmp.{package.groupId}</groupId>
  77. <artifactId>placeholder-{package.artifactId}</artifactId>
  78. <version>{package.version}</version>
  79. <name>Package {package.artifactId}</name>
  80. <dependencies>
  81. <dependency>
  82. <groupId>{package.groupId}</groupId>
  83. <artifactId>{package.artifactId}</artifactId>
  84. <version>{package.version}</version>
  85. </dependency>
  86. </dependencies>
  87. </project>
  88. """
  89. )
  90. logger.debug(f'{package}: POM parsed')
  91. def write(self, f):
  92. tree = ET.ElementTree(self.generated_root)
  93. ET.indent(tree)
  94. tree.write(f)
  95. def get_property(self, prop: str):
  96. elem = self.raw_root.find(f'.//properties/{prop}', ns)
  97. if elem is not None:
  98. return elem.text
  99. else:
  100. return None
  101. def _package_from_xml_dep(self, dep: ET.Element):
  102. def prop_replace(match):
  103. prop = match.group(1)
  104. value = self.get_property(match.group(1))
  105. logger.debug(f'Replacing property {prop} with {value}')
  106. return value
  107. return Package(
  108. *[
  109. re.sub(
  110. r'\$\{([^\}]*)\}',
  111. prop_replace,
  112. elem.text or '' if (elem := dep.find(tag, ns)) is not None else '',
  113. )
  114. for tag in [
  115. 'groupId',
  116. 'artifactId',
  117. 'version',
  118. ]
  119. ]
  120. )
  121. @property
  122. def dependency_management(self) -> list['Package']:
  123. dependencies: list[Package] = []
  124. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  125. package = self._package_from_xml_dep(dep)
  126. dependencies.append(package)
  127. return dependencies
  128. class Package:
  129. _pom: PackagePOM | None = None
  130. _verified: bool = False
  131. def __init__(self, groupId: str, artifactId: str, version: str | None = None, implicit: bool = False):
  132. self.groupId = groupId
  133. self.artifactId = artifactId
  134. self.version = version
  135. self.implicit = implicit
  136. def __str__(self) -> str:
  137. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  138. def __eq__(self, other) -> bool:
  139. return (
  140. self.groupId == other.groupId
  141. and self.artifactId == other.artifactId
  142. and self.version == other.version
  143. )
  144. def __hash__(self) -> int:
  145. return hash((self.groupId, self.artifactId, self.version))
  146. @property
  147. def dir_path(self):
  148. group_path = self.groupId.replace(".", "/")
  149. return f'{group_path}/{self.artifactId}/{self.version}'
  150. @property
  151. def base_filename(self):
  152. return f'{self.artifactId}-{self.version}'
  153. async def download_file(self, extension):
  154. filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
  155. async with aiohttp.ClientSession() as session:
  156. for mirror in mirrors:
  157. pom_url = f'{mirror}/{filepath}'
  158. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  159. async with session.get(pom_url) as response:
  160. if response.status == 200:
  161. logger.debug(f'{self}: {extension} downloaded')
  162. return await response.text()
  163. break
  164. else:
  165. logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}')
  166. else:
  167. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  168. return None
  169. @property
  170. async def pom(self) -> PackagePOM:
  171. if self._pom is not None:
  172. return self._pom
  173. if self.version is None:
  174. await self._query_maven()
  175. self._pom = PackagePOM(self, await self.download_file('pom'))
  176. return self._pom
  177. @property
  178. def _urlquery(self) -> str:
  179. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  180. if self.version is not None:
  181. q += f'+AND+v:{self.version}'
  182. return q
  183. async def _query_maven(self) -> None:
  184. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  185. logger.debug(f'{self}: Querying maven at url {url}')
  186. async with aiohttp.ClientSession() as session:
  187. async with session.get(url) as response:
  188. if response.status == 200:
  189. message = await response.json()
  190. num = message['response']['numFound']
  191. if num:
  192. logger.debug(f'{self}: Query successful')
  193. self._verified = True
  194. if self.version is None:
  195. version = message['response']['docs'][0]['latestVersion']
  196. logger.debug(f'{self}: Using newest version {version}')
  197. self.version = version
  198. else:
  199. if self.implicit:
  200. logger.debug(f'{self}: No matching packages found')
  201. else:
  202. logger.warning(f'{self}: No matching packages found')
  203. self._verified = False
  204. else:
  205. self._verified = False
  206. logger.warning(f'{self}: HTTP error {response.status} downloading pom')
  207. async def verify(self) -> bool:
  208. if not self._verified:
  209. await self._query_maven()
  210. return self._verified
  211. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  212. logger.info(f'Parsing {list_path}')
  213. with list_path.open('r') as f:
  214. for line in f.readlines():
  215. sections = line.strip().split(':')
  216. if len(sections) < 2 or len(sections) > 3:
  217. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  218. continue
  219. package = Package(
  220. sections[0],
  221. sections[1],
  222. sections[2] if len(sections) == 3 else None,
  223. )
  224. queue.put_nowait(package)
  225. if not package.artifactId.endswith('-jvm'):
  226. queue.put_nowait(
  227. Package(
  228. package.groupId,
  229. f'{package.artifactId}-jvm',
  230. package.version,
  231. True,
  232. )
  233. )
  234. async def download(package: Package, queue: asyncio.Queue) -> None:
  235. async with done_lock:
  236. skip = str(package) in done
  237. if skip:
  238. logger.info(f'{package}: Already downloaded. Skipping.')
  239. elif await package.verify():
  240. async with done_lock:
  241. done.add(str(package))
  242. pom_dir = base_pom_path / f'{package.groupId}-{package.artifactId}-{package.version}'
  243. pom_path = pom_dir / 'pom.xml'
  244. pom_dir.mkdir(exist_ok=True)
  245. pom = await package.pom
  246. if not pom:
  247. return
  248. pom.write(pom_path)
  249. logger.info(f'{package}: Downloaded')
  250. if not pom.is_bom:
  251. for dep in pom.dependency_management:
  252. logger.info(f'{package}: Handling transitive dependency {dep}')
  253. await queue.put(dep)
  254. async def worker(queue: asyncio.Queue) -> None:
  255. while True:
  256. package = await queue.get()
  257. await download(package, queue)
  258. queue.task_done()
  259. async def main() -> None:
  260. queue: asyncio.Queue = asyncio.Queue()
  261. tasks = []
  262. load_package_list(Path('package-list.txt'), queue)
  263. logger.debug(f'Starting {num_workers} workers')
  264. for i in range(num_workers):
  265. tasks.append(
  266. asyncio.create_task(
  267. worker(queue)
  268. )
  269. )
  270. await queue.join()
  271. logger.debug('Queue is empty. Cancelling workers')
  272. for task in tasks:
  273. task.cancel()
  274. await asyncio.gather(*tasks, return_exceptions=True)
  275. logger.info('Generating master POM')
  276. subprocess.call(['sh', 'generate_master_pom.sh'])
  277. logger = logging.getLogger(__name__)
  278. if __name__ == '__main__':
  279. parser = argparse.ArgumentParser()
  280. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  281. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  282. args = parser.parse_args()
  283. if args.verbosity == 0:
  284. log_level = 'WARNING'
  285. elif args.verbosity == 1:
  286. log_level = 'INFO'
  287. else:
  288. log_level = 'DEBUG'
  289. logging.basicConfig(level=log_level)
  290. num_workers = args.workers
  291. asyncio.run(main())