您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

363 行
11KB

  1. #!/bin/python3
  2. import re
  3. import argparse
  4. import logging
  5. import asyncio
  6. import subprocess
  7. import copy
  8. import aiohttp
  9. from pathlib import Path
  10. from xml.etree import ElementTree as ET
  11. ns = {'': 'http://maven.apache.org/POM/4.0.0'}
  12. ET.register_namespace('', ns[''])
  13. baseurl = 'https://search.maven.org'
  14. base_pom_path = Path('poms')
  15. mirrors = [
  16. "https://repo.maven.apache.org/maven2",
  17. "https://repo1.maven.org/maven2",
  18. "https://oss.sonatype.org/content/repositories/snapshots",
  19. "https://packages.confluent.io/maven",
  20. "https://registry.quarkus.io/maven",
  21. "https://plugins.gradle.org/m2",
  22. ]
  23. done: set[str] = set()
  24. done_lock = asyncio.Lock()
  25. num_workers = 50
  26. class PackagePOM:
  27. def __init__(self, package: 'Package', pom: str):
  28. logger.debug(f'{package}: Parsing POM')
  29. self.raw_root = ET.fromstring(pom)
  30. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  31. self.packaging = packaging.text
  32. else:
  33. self.packaging = '??'
  34. self.is_bom = self.packaging == 'pom'
  35. if self.packaging == 'pom':
  36. root_copy = copy.deepcopy(self.raw_root)
  37. depman = root_copy.find('dependencyManagement', ns)
  38. if depman is not None:
  39. root_copy.extend(depman.findall('*'))
  40. root_copy.remove(depman)
  41. if (groupId := root_copy.find('groupId', ns)) is not None:
  42. groupId.text = f'tmp.{package.groupId}'
  43. else:
  44. logger.warning(f"{package}: No groupId tag in pom")
  45. if (artifactId := root_copy.find('groupId', ns)) is not None:
  46. artifactId.text = f'placeholder.{package.artifactId}'
  47. else:
  48. logger.warning(f"{package}: No artifactId tag in pom")
  49. # Add a dependency for the pom itself
  50. if (dependencies := root_copy.find('dependencies', ns)) is not None:
  51. self_dep = ET.SubElement(dependencies, 'dependency')
  52. ET.SubElement(self_dep, 'groupId').text = package.groupId
  53. ET.SubElement(self_dep, 'artifactId').text = package.artifactId
  54. ET.SubElement(self_dep, 'version').text = package.version
  55. else:
  56. logger.warning(f"{package}: No dependencies tag in pom")
  57. self.generated_root = root_copy
  58. else:
  59. self.generated_root = ET.fromstring(
  60. f"""
  61. <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
  62. https://maven.apache.org/xsd/maven-4.0.0.xsd"
  63. xmlns="http://maven.apache.org/POM/4.0.0"
  64. xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  65. <modelVersion>4.0.0</modelVersion>
  66. <groupId>tmp.{package.groupId}</groupId>
  67. <artifactId>placeholder-{package.artifactId}</artifactId>
  68. <version>{package.version}</version>
  69. <name>Package {package.artifactId}</name>
  70. <dependencies>
  71. <dependency>
  72. <groupId>{package.groupId}</groupId>
  73. <artifactId>{package.artifactId}</artifactId>
  74. <version>{package.version}</version>
  75. </dependency>
  76. </dependencies>
  77. </project>
  78. """
  79. )
  80. logger.debug(f'{package}: POM parsed')
  81. def write(self, f):
  82. tree = ET.ElementTree(self.generated_root)
  83. ET.indent(tree)
  84. tree.write(f)
  85. def get_property(self, prop: str):
  86. elem = self.raw_root.find(f'.//properties/{prop}', ns)
  87. if elem is not None:
  88. return elem.text
  89. else:
  90. return None
  91. def _package_from_xml_dep(self, dep: ET.Element):
  92. def prop_replace(match):
  93. prop = match.group(1)
  94. value = self.get_property(match.group(1))
  95. logger.debug(f'Replacing property {prop} with {value}')
  96. return value
  97. return Package(
  98. *[
  99. re.sub(
  100. r'\$\{([^\}]*)\}',
  101. prop_replace,
  102. elem.text or '' if (elem := dep.find(tag, ns)) is not None else '',
  103. )
  104. for tag in [
  105. 'groupId',
  106. 'artifactId',
  107. 'version',
  108. ]
  109. ]
  110. )
  111. @property
  112. def dependency_management(self) -> list['Package']:
  113. dependencies: list[Package] = []
  114. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  115. package = self._package_from_xml_dep(dep)
  116. dependencies.append(package)
  117. return dependencies
  118. class Package:
  119. _pom: PackagePOM | None = None
  120. _verified: bool = False
  121. def __init__(self, groupId: str, artifactId: str, version: str | None = None, implicit: bool = False):
  122. self.groupId = groupId
  123. self.artifactId = artifactId
  124. self.version = version
  125. self.implicit = implicit
  126. def __str__(self) -> str:
  127. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  128. def __eq__(self, other) -> bool:
  129. return (
  130. self.groupId == other.groupId
  131. and self.artifactId == other.artifactId
  132. and self.version == other.version
  133. )
  134. def __hash__(self) -> int:
  135. return hash((self.groupId, self.artifactId, self.version))
  136. @property
  137. def dir_path(self):
  138. group_path = self.groupId.replace(".", "/")
  139. return f'{group_path}/{self.artifactId}/{self.version}'
  140. @property
  141. def base_filename(self):
  142. return f'{self.artifactId}-{self.version}'
  143. async def download_file(self, extension):
  144. filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
  145. async with aiohttp.ClientSession() as session:
  146. for mirror in mirrors:
  147. pom_url = f'{mirror}/{filepath}'
  148. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  149. async with session.get(pom_url) as response:
  150. if response.status == 200:
  151. logger.debug(f'{self}: {extension} downloaded')
  152. return await response.text()
  153. break
  154. else:
  155. logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}')
  156. else:
  157. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  158. return None
  159. @property
  160. async def pom(self) -> PackagePOM:
  161. if self._pom is not None:
  162. return self._pom
  163. if self.version is None:
  164. await self._query_maven()
  165. self._pom = PackagePOM(self, await self.download_file('pom'))
  166. return self._pom
  167. @property
  168. def _urlquery(self) -> str:
  169. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  170. if self.version is not None:
  171. q += f'+AND+v:{self.version}'
  172. return q
  173. async def _query_maven(self) -> None:
  174. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  175. logger.debug(f'{self}: Querying maven at url {url}')
  176. async with aiohttp.ClientSession() as session:
  177. async with session.get(url) as response:
  178. if response.status == 200:
  179. message = await response.json()
  180. num = message['response']['numFound']
  181. if num:
  182. logger.debug(f'{self}: Query successful')
  183. self._verified = True
  184. if self.version is None:
  185. version = message['response']['docs'][0]['latestVersion']
  186. logger.debug(f'{self}: Using newest version {version}')
  187. self.version = version
  188. else:
  189. if self.implicit:
  190. logger.debug(f'{self}: No matching packages found')
  191. else:
  192. logger.warning(f'{self}: No matching packages found')
  193. self._verified = False
  194. else:
  195. self._verified = False
  196. logger.warning(f'{self}: HTTP error {response.status} downloading pom')
  197. async def verify(self) -> bool:
  198. if not self._verified:
  199. await self._query_maven()
  200. return self._verified
  201. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  202. logger.info(f'Parsing {list_path}')
  203. with list_path.open('r') as f:
  204. for line in f.readlines():
  205. sections = line.strip().split(':')
  206. if len(sections) < 2 or len(sections) > 3:
  207. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  208. continue
  209. package = Package(
  210. sections[0],
  211. sections[1],
  212. sections[2] if len(sections) == 3 else None,
  213. )
  214. queue.put_nowait(package)
  215. if not package.artifactId.endswith('-jvm'):
  216. queue.put_nowait(
  217. Package(
  218. package.groupId,
  219. f'{package.artifactId}-jvm',
  220. package.version,
  221. True,
  222. )
  223. )
  224. async def download(package: Package, queue: asyncio.Queue) -> None:
  225. async with done_lock:
  226. skip = str(package) in done
  227. if skip:
  228. logger.info(f'{package}: Already downloaded. Skipping.')
  229. elif await package.verify():
  230. async with done_lock:
  231. done.add(str(package))
  232. pom_dir = base_pom_path / f'{package.groupId}-{package.artifactId}-{package.version}'
  233. pom_path = pom_dir / 'pom.xml'
  234. pom_dir.mkdir(exist_ok=True)
  235. pom = await package.pom
  236. if not pom:
  237. return
  238. pom.write(pom_path)
  239. logger.info(f'{package}: Downloaded')
  240. if not pom.is_bom:
  241. for dep in pom.dependency_management:
  242. logger.info(f'{package}: Handling transitive dependency {dep}')
  243. await queue.put(dep)
  244. async def worker(queue: asyncio.Queue) -> None:
  245. while True:
  246. package = await queue.get()
  247. await download(package, queue)
  248. queue.task_done()
  249. async def main() -> None:
  250. queue: asyncio.Queue = asyncio.Queue()
  251. tasks = []
  252. load_package_list(Path('package-list.txt'), queue)
  253. logger.debug(f'Starting {num_workers} workers')
  254. for i in range(num_workers):
  255. tasks.append(
  256. asyncio.create_task(
  257. worker(queue)
  258. )
  259. )
  260. await queue.join()
  261. logger.debug('Queue is empty. Cancelling workers')
  262. for task in tasks:
  263. task.cancel()
  264. await asyncio.gather(*tasks, return_exceptions=True)
  265. logger.info('Generating master POM')
  266. subprocess.call(['sh', 'generate_master_pom.sh'])
  267. logger = logging.getLogger(__name__)
  268. if __name__ == '__main__':
  269. parser = argparse.ArgumentParser()
  270. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  271. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  272. args = parser.parse_args()
  273. if args.verbosity == 0:
  274. log_level = 'WARNING'
  275. elif args.verbosity == 1:
  276. log_level = 'INFO'
  277. else:
  278. log_level = 'DEBUG'
  279. logging.basicConfig(level=log_level)
  280. num_workers = args.workers
  281. asyncio.run(main())