25'ten fazla konu seçemezsiniz Konular bir harf veya rakamla başlamalı, kısa çizgiler ('-') içerebilir ve en fazla 35 karakter uzunluğunda olabilir.

408 satır
13KB

  1. #!/bin/python3
  2. import re
  3. import random
  4. import argparse
  5. import logging
  6. import asyncio
  7. import subprocess
  8. import copy
  9. import aiohttp
  10. from pathlib import Path
  11. from xml.etree import ElementTree as ET
  12. ns = {'': 'http://maven.apache.org/POM/4.0.0'}
  13. ET.register_namespace('', ns[''])
  14. baseurl = 'https://search.maven.org'
  15. base_pom_path = Path('poms')
  16. mirrors = [
  17. "https://repo.maven.apache.org/maven2",
  18. "https://repo1.maven.org/maven2",
  19. "https://oss.sonatype.org/content/repositories/snapshots",
  20. "https://packages.confluent.io/maven",
  21. "https://registry.quarkus.io/maven",
  22. "https://plugins.gradle.org/m2",
  23. ]
  24. done: set[str] = set()
  25. done_lock = asyncio.Lock()
  26. num_workers = 50
  27. class TooManyRequestsException(Exception):
  28. pass
  29. class PackagePOM:
  30. def __init__(self, package: 'Package', pom: str):
  31. self._package = package
  32. logger.debug(f'{package}: Parsing POM')
  33. self.raw_root = ET.fromstring(pom)
  34. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  35. self.packaging = packaging.text
  36. else:
  37. self.packaging = '??'
  38. self.is_bom = self.packaging == 'pom'
  39. if self.packaging == 'pom':
  40. root_copy = copy.deepcopy(self.raw_root)
  41. depman = root_copy.find('dependencyManagement', ns)
  42. if depman is not None:
  43. root_copy.extend(depman.findall('*'))
  44. root_copy.remove(depman)
  45. tmpGroupId = f'tmp.{package.groupId}'
  46. tmpArtifactId = f'placeholder.{package.artifactId}'
  47. tmpVersion = package.version
  48. if (groupId := root_copy.find('groupId', ns)) is not None:
  49. groupId.text = tmpGroupId
  50. else:
  51. logger.info(f"{package}: Inserting new groupId tag in pom")
  52. ET.SubElement(root_copy, 'groupId').text = tmpGroupId
  53. if (artifactId := root_copy.find('artifactId', ns)) is not None:
  54. artifactId.text = tmpArtifactId
  55. else:
  56. logger.info(f"{package}: Inserting new artifactId tag in pom")
  57. ET.SubElement(root_copy, 'artifactId').text = tmpArtifactId
  58. if (version := root_copy.find('version', ns)) is not None:
  59. version.text = tmpVersion
  60. else:
  61. logger.info(f"{package}: Inserting new version tag in pom")
  62. ET.SubElement(root_copy, 'version').text = tmpVersion
  63. # Add a dependency for the pom itself
  64. if (dependencies := root_copy.find('dependencies', ns)) is not None:
  65. self_dep = ET.SubElement(dependencies, 'dependency')
  66. ET.SubElement(self_dep, 'groupId').text = package.groupId
  67. ET.SubElement(self_dep, 'artifactId').text = package.artifactId
  68. ET.SubElement(self_dep, 'version').text = package.version
  69. else:
  70. logger.warning(f"{package}: No dependencies tag in pom")
  71. self.generated_root = root_copy
  72. else:
  73. self.generated_root = ET.fromstring(
  74. f"""
  75. <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
  76. https://maven.apache.org/xsd/maven-4.0.0.xsd"
  77. xmlns="http://maven.apache.org/POM/4.0.0"
  78. xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  79. <modelVersion>4.0.0</modelVersion>
  80. <groupId>tmp.{package.groupId}</groupId>
  81. <artifactId>placeholder-{package.artifactId}</artifactId>
  82. <version>{package.version}</version>
  83. <name>Package {package.artifactId}</name>
  84. <dependencies>
  85. <dependency>
  86. <groupId>{package.groupId}</groupId>
  87. <artifactId>{package.artifactId}</artifactId>
  88. <version>{package.version}</version>
  89. </dependency>
  90. </dependencies>
  91. </project>
  92. """
  93. )
  94. logger.debug(f'{package}: POM parsed')
  95. def write(self, f):
  96. tree = ET.ElementTree(self.generated_root)
  97. ET.indent(tree)
  98. tree.write(f)
  99. def get_property(self, prop: str):
  100. elem = self.raw_root.find(f'.//properties/{prop}', ns)
  101. if elem is not None:
  102. return elem.text
  103. else:
  104. return None
  105. def _package_from_xml_dep(self, dep: ET.Element) -> 'Package':
  106. def lookup_prop(match) -> str:
  107. prop = match.group(1)
  108. if prop == 'project.groupId':
  109. value = str(self._package.groupId)
  110. elif prop == 'project.artifactId':
  111. value = str(self._package.artifactId)
  112. elif prop == 'project.version':
  113. value = str(self._package.version)
  114. else:
  115. value = self.get_property(prop)
  116. logger.debug(f'{self._package}: Trying to recurse prop {value}')
  117. value = prop_replace(value)
  118. logger.debug(f'{self._package}: Replacing property {prop} with {value}')
  119. return value
  120. def prop_replace(text) -> str:
  121. logger.debug(f'{self._package}: Getting prop {text}')
  122. return re.sub(
  123. r'\$\{([^\}]*)\}',
  124. lookup_prop,
  125. text,
  126. )
  127. def prop_replace_tag(tag) -> str:
  128. return prop_replace(
  129. elem.text or '' if (elem := dep.find(tag, ns)) is not None else '',
  130. )
  131. return Package(
  132. groupId=prop_replace_tag('groupId'),
  133. artifactId=prop_replace_tag('artifactId'),
  134. version=prop_replace_tag('version'),
  135. )
  136. @property
  137. def dependency_management(self) -> list['Package']:
  138. dependencies: list[Package] = []
  139. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  140. package = self._package_from_xml_dep(dep)
  141. dependencies.append(package)
  142. return dependencies
  143. class Package:
  144. _pom: PackagePOM | None = None
  145. _verified: bool = False
  146. def __init__(self, groupId: str, artifactId: str, version: str | None = None, implicit: bool = False):
  147. self.groupId = groupId
  148. self.artifactId = artifactId
  149. self.version = version
  150. self.implicit = implicit
  151. def __str__(self) -> str:
  152. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  153. def __eq__(self, other) -> bool:
  154. return (
  155. self.groupId == other.groupId
  156. and self.artifactId == other.artifactId
  157. and self.version == other.version
  158. )
  159. def __hash__(self) -> int:
  160. return hash((self.groupId, self.artifactId, self.version))
  161. @property
  162. def dir_path(self):
  163. group_path = self.groupId.replace(".", "/")
  164. return f'{group_path}/{self.artifactId}/{self.version}'
  165. @property
  166. def base_filename(self):
  167. return f'{self.artifactId}-{self.version}'
  168. async def download_file(self, extension):
  169. filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
  170. async with aiohttp.ClientSession() as session:
  171. for mirror in mirrors:
  172. pom_url = f'{mirror}/{filepath}'
  173. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  174. async with session.get(pom_url) as response:
  175. if response.status == 200:
  176. logger.debug(f'{self}: {extension} downloaded')
  177. return await response.text()
  178. break
  179. elif response.status == 429:
  180. raise TooManyRequestsException()
  181. else:
  182. logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}')
  183. else:
  184. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  185. return None
  186. @property
  187. async def pom(self) -> PackagePOM:
  188. if self._pom is not None:
  189. return self._pom
  190. if self.version is None:
  191. await self._query_maven()
  192. self._pom = PackagePOM(self, await self.download_file('pom'))
  193. return self._pom
  194. @property
  195. def _urlquery(self) -> str:
  196. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  197. if self.version is not None:
  198. q += f'+AND+v:{self.version}'
  199. return q
  200. async def _query_maven(self) -> None:
  201. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  202. logger.debug(f'{self}: Querying maven at url {url}')
  203. async with aiohttp.ClientSession() as session:
  204. async with session.get(url) as response:
  205. if response.status == 200:
  206. message = await response.json()
  207. num = message['response']['numFound']
  208. if num:
  209. logger.debug(f'{self}: Query successful')
  210. self._verified = True
  211. if self.version is None:
  212. version = message['response']['docs'][0]['latestVersion']
  213. logger.debug(f'{self}: Using newest version {version}')
  214. self.version = version
  215. else:
  216. if self.implicit:
  217. logger.debug(f'{self}: No matching packages found')
  218. else:
  219. logger.warning(f'{self}: No matching packages found')
  220. self._verified = False
  221. elif response.status == 429:
  222. raise TooManyRequestsException()
  223. else:
  224. self._verified = False
  225. logger.warning(f'{self}: HTTP error {response.status} downloading pom')
  226. async def verify(self) -> bool:
  227. if not self._verified:
  228. await self._query_maven()
  229. return self._verified
  230. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  231. logger.info(f'Parsing {list_path}')
  232. with list_path.open('r') as f:
  233. for line in f.readlines():
  234. sections = line.strip().split(':')
  235. if len(sections) < 2 or len(sections) > 3:
  236. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  237. continue
  238. package = Package(
  239. sections[0],
  240. sections[1],
  241. sections[2] if len(sections) == 3 else None,
  242. )
  243. queue.put_nowait(package)
  244. if not package.artifactId.endswith('-jvm'):
  245. queue.put_nowait(
  246. Package(
  247. package.groupId,
  248. f'{package.artifactId}-jvm',
  249. package.version,
  250. True,
  251. )
  252. )
  253. async def download(package: Package, queue: asyncio.Queue) -> None:
  254. async with done_lock:
  255. skip = str(package) in done
  256. if skip:
  257. logger.info(f'{package}: Already downloaded. Skipping.')
  258. elif await package.verify():
  259. async with done_lock:
  260. done.add(str(package))
  261. pom_dir = base_pom_path / f'{package.groupId}-{package.artifactId}-{package.version}'
  262. pom_path = pom_dir / 'pom.xml'
  263. pom_dir.mkdir(exist_ok=True)
  264. pom = await package.pom
  265. if not pom:
  266. return
  267. pom.write(pom_path)
  268. logger.info(f'{package}: Downloaded')
  269. if not pom.is_bom:
  270. for dep in pom.dependency_management:
  271. logger.info(f'{package}: Handling transitive dependency {dep}')
  272. await queue.put(dep)
  273. async def worker(queue: asyncio.Queue) -> None:
  274. while True:
  275. package = await queue.get()
  276. while True:
  277. try:
  278. await download(package, queue)
  279. break
  280. except TooManyRequestsException:
  281. logger.info('Too many requests. Delaying next attempt')
  282. await asyncio.sleep(3*random.random() + 0.2)
  283. queue.task_done()
  284. async def main() -> None:
  285. queue: asyncio.Queue = asyncio.Queue()
  286. tasks = []
  287. load_package_list(Path('package-list.txt'), queue)
  288. logger.debug(f'Starting {num_workers} workers')
  289. for i in range(num_workers):
  290. tasks.append(
  291. asyncio.create_task(
  292. worker(queue)
  293. )
  294. )
  295. await queue.join()
  296. logger.debug('Queue is empty. Cancelling workers')
  297. for task in tasks:
  298. task.cancel()
  299. await asyncio.gather(*tasks, return_exceptions=True)
  300. logger.info('Generating master POM')
  301. subprocess.call(['sh', 'generate_master_pom.sh'])
  302. logger = logging.getLogger(__name__)
  303. if __name__ == '__main__':
  304. parser = argparse.ArgumentParser()
  305. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  306. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  307. args = parser.parse_args()
  308. if args.verbosity == 0:
  309. log_level = 'WARNING'
  310. elif args.verbosity == 1:
  311. log_level = 'INFO'
  312. else:
  313. log_level = 'DEBUG'
  314. logging.basicConfig(level=log_level)
  315. num_workers = args.workers
  316. asyncio.run(main())