Nie możesz wybrać więcej niż 25 tematów. Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

525 wierszy
18KB

  1. #!/bin/python3
  2. import re
  3. import copy
  4. import random
  5. import argparse
  6. import logging
  7. import asyncio
  8. import subprocess
  9. import copy
  10. import aiohttp
  11. from pathlib import Path
  12. from xml.etree import ElementTree as ET
  13. ns = {'': 'http://maven.apache.org/POM/4.0.0'}
  14. ET.register_namespace('', ns[''])
  15. baseurl = 'https://search.maven.org'
  16. base_pom_path = Path('poms')
  17. mirrors = [
  18. "https://repo.maven.apache.org/maven2",
  19. "https://repo1.maven.org/maven2",
  20. "https://oss.sonatype.org/content/repositories/snapshots",
  21. "https://packages.confluent.io/maven",
  22. "https://registry.quarkus.io/maven",
  23. "https://plugins.gradle.org/m2",
  24. ]
  25. done: set[str] = set()
  26. done_lock = asyncio.Lock()
  27. in_progress: set[str] = set()
  28. in_progress_lock = asyncio.Lock()
  29. num_workers = 50
  30. global_properties: dict[str, dict[str, str]] = {}
  31. class TooManyRequestsException(Exception):
  32. pass
  33. class PackageError(Exception):
  34. pass
  35. class WaitForPackage(Exception):
  36. def __init__(self, package):
  37. self.package = package
  38. class PackagePOM:
  39. def __init__(self, package: 'Package', pom: str):
  40. self._package = package
  41. logger.debug(f'{package}: Parsing POM')
  42. self.raw_root = ET.fromstring(pom)
  43. self.parent: Package | None = None
  44. if (parent_tag := self.raw_root.find('parent', ns)) is not None:
  45. parent_group_tag = parent_tag.find('groupId', ns)
  46. parent_artifact_tag = parent_tag.find('artifactId', ns)
  47. parent_version_tag = parent_tag.find('version', ns)
  48. parent_group = parent_group_tag.text if parent_group_tag is not None else None
  49. parent_artifact = parent_artifact_tag.text if parent_artifact_tag is not None else None
  50. parent_version = parent_version_tag.text if parent_version_tag is not None else None
  51. logger.debug(f'{package}: Parsing parent {parent_group}:{parent_artifact}:{parent_version}')
  52. if parent_group is not None and parent_artifact is not None and parent_version is not None:
  53. parent = Package(
  54. parent_group,
  55. parent_artifact,
  56. parent_version,
  57. )
  58. if str(parent) in done:
  59. self.parent = parent
  60. else:
  61. raise WaitForPackage(parent)
  62. else:
  63. raise PackageError(f'Invalid parent {parent_group}:{parent_artifact}:{parent_version}')
  64. logger.debug(f'{package}: Parsing properties')
  65. parent_props: dict[str, str] = {} if self.parent is None else global_properties[str(self.parent)]
  66. self.properties = self.resolve_props(parent_props)
  67. global_properties[str(package)] = self.properties
  68. logger.debug(f'{package}: Parsing packaging')
  69. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  70. self.packaging = packaging.text
  71. else:
  72. self.packaging = '??'
  73. self.is_bom = self.packaging == 'pom'
  74. if self.packaging == 'pom':
  75. root_copy = copy.deepcopy(self.raw_root)
  76. dependencies = root_copy.find('dependencies', ns) or ET.SubElement(root_copy, 'dependencies')
  77. depman = root_copy.find('dependencyManagement', ns)
  78. if depman is not None:
  79. tmp_deps = depman.findall('dependencies/*', ns)
  80. dependencies.extend(tmp_deps)
  81. root_copy.remove(depman)
  82. tmpGroupId = f'tmp.{package.groupId}'
  83. tmpArtifactId = f'placeholder.{package.artifactId}'
  84. tmpVersion = package.version
  85. if (groupId := root_copy.find('groupId', ns)) is not None:
  86. groupId.text = tmpGroupId
  87. else:
  88. logger.info(f"{package}: Inserting new groupId tag in pom")
  89. ET.SubElement(root_copy, 'groupId').text = tmpGroupId
  90. if (artifactId := root_copy.find('artifactId', ns)) is not None:
  91. artifactId.text = tmpArtifactId
  92. else:
  93. logger.info(f"{package}: Inserting new artifactId tag in pom")
  94. ET.SubElement(root_copy, 'artifactId').text = tmpArtifactId
  95. if (version := root_copy.find('version', ns)) is not None:
  96. version.text = tmpVersion
  97. else:
  98. logger.info(f"{package}: Inserting new version tag in pom")
  99. ET.SubElement(root_copy, 'version').text = tmpVersion
  100. # Add a dependency for the pom itself
  101. self_dep = ET.SubElement(dependencies, 'dependency')
  102. ET.SubElement(self_dep, 'groupId').text = package.groupId
  103. ET.SubElement(self_dep, 'artifactId').text = package.artifactId
  104. ET.SubElement(self_dep, 'version').text = package.version
  105. self.generated_root = root_copy
  106. else:
  107. self.generated_root = ET.fromstring(
  108. f"""
  109. <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
  110. https://maven.apache.org/xsd/maven-4.0.0.xsd"
  111. xmlns="http://maven.apache.org/POM/4.0.0"
  112. xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  113. <modelVersion>4.0.0</modelVersion>
  114. <groupId>tmp.{package.groupId}</groupId>
  115. <artifactId>placeholder-{package.artifactId}</artifactId>
  116. <version>{package.version}</version>
  117. <name>Package {package.artifactId}</name>
  118. <dependencies>
  119. <dependency>
  120. <groupId>{package.groupId}</groupId>
  121. <artifactId>{package.artifactId}</artifactId>
  122. <version>{package.version}</version>
  123. </dependency>
  124. </dependencies>
  125. </project>
  126. """
  127. )
  128. logger.debug(f'{package}: POM parsed')
  129. def write(self, f):
  130. tree = ET.ElementTree(self.generated_root)
  131. ET.indent(tree)
  132. tree.write(f)
  133. def resolve_props(self, initial: dict[str, str]):
  134. props = initial
  135. for prop_tag in self.raw_root.findall('.//properties/*', ns):
  136. prop = prop_tag.tag.replace(f'{{{ns[""]}}}', '')
  137. value = prop_tag.text if prop_tag.text is not None else ''
  138. logger.debug(f'{self._package}: Setting prop {prop}={value}')
  139. props[prop] = value
  140. changed = True
  141. while changed:
  142. changed = False
  143. for prop, value in props.items():
  144. new_value = self.prop_replace(value, props)
  145. if new_value != value:
  146. changed = True
  147. logger.debug(f'{self._package}: Setting prop {prop}={new_value}')
  148. props[prop] = new_value
  149. return props
  150. def prop_replace(self, text, props: dict[str, str] | None = None) -> str:
  151. def lookup_prop(match) -> str:
  152. prop = match.group(1)
  153. if prop == 'project.groupId':
  154. value = str(self._package.groupId)
  155. elif prop == 'project.artifactId':
  156. value = str(self._package.artifactId)
  157. elif prop == 'project.version':
  158. value = str(self._package.version)
  159. elif prop.startswith('project.build') or prop.startswith('env.') or prop.startswith('maven.'):
  160. value = ''
  161. else:
  162. try:
  163. value = props[prop] if props is not None else self.properties[prop]
  164. except KeyError:
  165. logger.error(f'{self._package}: Could not find property {prop}. Setting it to ""')
  166. value = ''
  167. logger.debug(f'{self._package}: Replacing property {prop} with {value}')
  168. return value
  169. return re.sub(
  170. r'\$\{([^\}]*)\}',
  171. lookup_prop,
  172. text,
  173. )
  174. def _package_from_xml_dep(self, dep: ET.Element) -> 'Package':
  175. def prop_replace_tag(tag) -> str:
  176. return self.prop_replace(
  177. elem.text or '' if (elem := dep.find(tag, ns)) is not None else '',
  178. )
  179. return Package(
  180. groupId=prop_replace_tag('groupId'),
  181. artifactId=prop_replace_tag('artifactId'),
  182. version=prop_replace_tag('version'),
  183. )
  184. @property
  185. def dependency_management(self) -> list['Package']:
  186. dependencies: list[Package] = []
  187. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  188. package = self._package_from_xml_dep(dep)
  189. dependencies.append(package)
  190. return dependencies
  191. class Package:
  192. _pom: PackagePOM | None = None
  193. _verified: bool = False
  194. def __init__(self, groupId: str, artifactId: str, version: str | None = None, implicit: bool = False):
  195. self.groupId = groupId
  196. self.artifactId = artifactId
  197. self.version = version
  198. self.implicit = implicit
  199. def __str__(self) -> str:
  200. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  201. def __eq__(self, other) -> bool:
  202. return (
  203. self.groupId == other.groupId
  204. and self.artifactId == other.artifactId
  205. and self.version == other.version
  206. )
  207. def __hash__(self) -> int:
  208. return hash((self.groupId, self.artifactId, self.version))
  209. @property
  210. def dir_path(self):
  211. group_path = self.groupId.replace(".", "/")
  212. return f'{group_path}/{self.artifactId}/{self.version}'
  213. @property
  214. def base_filename(self):
  215. return f'{self.artifactId}-{self.version}'
  216. async def download_file(self, extension):
  217. filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
  218. async with aiohttp.ClientSession() as session:
  219. for mirror in mirrors:
  220. pom_url = f'{mirror}/{filepath}'
  221. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  222. async with session.get(pom_url) as response:
  223. if response.status == 200:
  224. logger.debug(f'{self}: {extension} downloaded')
  225. return await response.text()
  226. break
  227. elif response.status == 429:
  228. raise TooManyRequestsException()
  229. else:
  230. logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}')
  231. else:
  232. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  233. return None
  234. @property
  235. async def pom(self) -> PackagePOM:
  236. if self._pom is not None:
  237. return self._pom
  238. if self.version is None:
  239. await self._query_maven()
  240. self._pom = PackagePOM(self, await self.download_file('pom'))
  241. return self._pom
  242. @property
  243. def _urlquery(self) -> str:
  244. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  245. if self.version is not None:
  246. q += f'+AND+v:{self.version}'
  247. return q
  248. async def _query_maven(self) -> None:
  249. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  250. logger.debug(f'{self}: Querying maven at url {url}')
  251. async with aiohttp.ClientSession() as session:
  252. async with session.get(url) as response:
  253. if response.status == 200:
  254. message = await response.json()
  255. num = message['response']['numFound']
  256. if num:
  257. logger.debug(f'{self}: Query successful')
  258. self._verified = True
  259. if self.version is None:
  260. version = message['response']['docs'][0]['latestVersion']
  261. logger.debug(f'{self}: Using newest version {version}')
  262. self.version = version
  263. else:
  264. if self.implicit:
  265. logger.debug(f'{self}: No matching packages found')
  266. else:
  267. logger.warning(f'{self}: No matching packages found')
  268. self._verified = False
  269. elif response.status == 429:
  270. raise TooManyRequestsException()
  271. else:
  272. self._verified = False
  273. logger.warning(f'{self}: HTTP error {response.status} downloading pom')
  274. async def verify(self) -> bool:
  275. if not self._verified:
  276. await self._query_maven()
  277. return self._verified
  278. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  279. logger.info(f'Parsing {list_path}')
  280. with list_path.open('r') as f:
  281. for line in f.readlines():
  282. sections = line.strip().split(':')
  283. if len(sections) < 2 or len(sections) > 3:
  284. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  285. continue
  286. package = Package(
  287. sections[0],
  288. sections[1],
  289. sections[2] if len(sections) == 3 else None,
  290. )
  291. queue.put_nowait(package)
  292. if not package.artifactId.endswith('-jvm'):
  293. queue.put_nowait(
  294. Package(
  295. package.groupId,
  296. f'{package.artifactId}-jvm',
  297. package.version,
  298. True,
  299. )
  300. )
  301. async def download(package: Package, queue: asyncio.Queue) -> None:
  302. async with done_lock:
  303. is_done = str(package) in done
  304. async with in_progress_lock:
  305. is_in_progress = str(package) in in_progress
  306. if is_done:
  307. logger.info(f'{package}: Already downloaded. Skipping.')
  308. elif is_in_progress:
  309. logger.info(f'{package}: Already in progress. Skipping.')
  310. else:
  311. async with in_progress_lock:
  312. in_progress.add(str(package))
  313. for _ in range(50):
  314. try:
  315. verified = await package.verify()
  316. break
  317. except TooManyRequestsException:
  318. logger.info('Too many requests. Delaying next attempt')
  319. await asyncio.sleep(3*random.random() + 0.2)
  320. else:
  321. logger.error(f'{package}: Verification failed after 50 tries')
  322. exit(1)
  323. if verified:
  324. while True:
  325. try:
  326. pom = await package.pom
  327. break
  328. except TooManyRequestsException:
  329. logger.info('Too many requests. Delaying next attempt')
  330. await asyncio.sleep(3*random.random() + 0.2)
  331. else:
  332. logger.error(f'{package}: Verification failed after 50 tries')
  333. exit(1)
  334. if not pom:
  335. logger.warn(f'{package}: No pom')
  336. return
  337. pom_dir = base_pom_path / f'{package.groupId}-{package.artifactId}-{package.version}'
  338. pom_path = pom_dir / 'pom.xml'
  339. pom_dir.mkdir(exist_ok=True)
  340. pom.write(pom_path)
  341. logger.info(f'{package}: Downloaded')
  342. if not pom.is_bom:
  343. for dep in pom.dependency_management:
  344. logger.info(f'{package}: Handling transitive dependency {dep}')
  345. await queue.put(dep)
  346. async with done_lock:
  347. logger.debug(f'{package}: Marking done')
  348. p = copy.copy(package)
  349. p.version = None
  350. done.add(str(package))
  351. done.add(str(p))
  352. async with in_progress_lock:
  353. if str(package) in in_progress:
  354. in_progress.remove(str(package))
  355. else:
  356. p = copy.copy(package)
  357. p.version = None
  358. if str(p) in in_progress:
  359. in_progress.remove(str(p))
  360. else:
  361. logger.warning(f'{package}: Package is done, but not marked as in progress')
  362. async def worker(queue: asyncio.Queue) -> None:
  363. while True:
  364. package = await queue.get()
  365. while True:
  366. try:
  367. await download(package, queue)
  368. break
  369. except WaitForPackage as e:
  370. logger.info(f'{package}: Waiting for {e.package}')
  371. await queue.put(e.package)
  372. await queue.put(package)
  373. break
  374. except PackageError:
  375. logger.exception(f'{package}: Error while processing package')
  376. break
  377. except Exception:
  378. logger.exception(f'{package}: Unknown error while processing package')
  379. logger.error(global_properties)
  380. break
  381. queue.task_done()
  382. async def main() -> None:
  383. queue: asyncio.Queue = asyncio.Queue()
  384. tasks = []
  385. load_package_list(Path('package-list.txt'), queue)
  386. logger.debug(f'Starting {num_workers} workers')
  387. for i in range(num_workers):
  388. tasks.append(
  389. asyncio.create_task(
  390. worker(queue)
  391. )
  392. )
  393. await queue.join()
  394. logger.debug('Queue is empty. Cancelling workers')
  395. for task in tasks:
  396. task.cancel()
  397. await asyncio.gather(*tasks, return_exceptions=True)
  398. logger.info('Generating master POM')
  399. subprocess.call(['sh', 'generate_master_pom.sh'])
  400. logger = logging.getLogger(__name__)
  401. if __name__ == '__main__':
  402. parser = argparse.ArgumentParser()
  403. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  404. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  405. args = parser.parse_args()
  406. if args.verbosity == 0:
  407. log_level = 'WARNING'
  408. elif args.verbosity == 1:
  409. log_level = 'INFO'
  410. else:
  411. log_level = 'DEBUG'
  412. logging.basicConfig(level=log_level)
  413. num_workers = args.workers
  414. asyncio.run(main())