Você não pode selecionar mais de 25 tópicos. Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

505 linhas
17KB

  1. #!/bin/python3
  2. import re
  3. import copy
  4. import random
  5. import argparse
  6. import logging
  7. import asyncio
  8. import subprocess
  9. import copy
  10. import aiohttp
  11. from pathlib import Path
  12. from xml.etree import ElementTree as ET
  13. ns = {'': 'http://maven.apache.org/POM/4.0.0'}
  14. ET.register_namespace('', ns[''])
  15. baseurl = 'https://search.maven.org'
  16. base_pom_path = Path('poms')
  17. mirrors = [
  18. "https://repo.maven.apache.org/maven2",
  19. "https://repo1.maven.org/maven2",
  20. "https://oss.sonatype.org/content/repositories/snapshots",
  21. "https://packages.confluent.io/maven",
  22. "https://registry.quarkus.io/maven",
  23. "https://plugins.gradle.org/m2",
  24. ]
  25. done: set[str] = set()
  26. done_lock = asyncio.Lock()
  27. in_progress: set[str] = set()
  28. in_progress_lock = asyncio.Lock()
  29. num_workers = 50
  30. global_properties: dict[str, dict[str, str]] = {}
  31. class TooManyRequestsException(Exception):
  32. pass
  33. class PackageError(Exception):
  34. pass
  35. class WaitForPackage(Exception):
  36. def __init__(self, package):
  37. self.package = package
  38. class PackagePOM:
  39. def __init__(self, package: 'Package', pom: str):
  40. self._package = package
  41. logger.debug(f'{package}: Parsing POM')
  42. self.raw_root = ET.fromstring(pom)
  43. self.parent: Package | None = None
  44. if (parent_tag := self.raw_root.find('parent', ns)) is not None:
  45. parent_group_tag = parent_tag.find('groupId', ns)
  46. parent_artifact_tag = parent_tag.find('artifactId', ns)
  47. parent_version_tag = parent_tag.find('version', ns)
  48. parent_group = parent_group_tag.text if parent_group_tag is not None else None
  49. parent_artifact = parent_artifact_tag.text if parent_artifact_tag is not None else None
  50. parent_version = parent_version_tag.text if parent_version_tag is not None else None
  51. logger.debug(f'{package}: Parsing parent {parent_group}:{parent_artifact}:{parent_version}')
  52. if parent_group is not None and parent_artifact is not None and parent_version is not None:
  53. parent = Package(
  54. parent_group,
  55. parent_artifact,
  56. parent_version,
  57. )
  58. if str(parent) in done:
  59. self.parent = parent
  60. else:
  61. raise WaitForPackage(parent)
  62. else:
  63. raise PackageError(f'Invalid parent {parent_group}:{parent_artifact}:{parent_version}')
  64. logger.debug(f'{package}: Parsing properties')
  65. parent_props: dict[str, str] = {} if self.parent is None else global_properties[str(self.parent)]
  66. self.properties = self.resolve_props(parent_props)
  67. global_properties[str(package)] = self.properties
  68. logger.debug(f'{package}: Parsing packaging')
  69. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  70. self.packaging = packaging.text
  71. else:
  72. self.packaging = '??'
  73. self.is_bom = self.packaging == 'pom'
  74. if self.packaging == 'pom':
  75. root_copy = copy.deepcopy(self.raw_root)
  76. depman = root_copy.find('dependencyManagement', ns)
  77. if depman is not None:
  78. root_copy.extend(depman.findall('*'))
  79. root_copy.remove(depman)
  80. tmpGroupId = f'tmp.{package.groupId}'
  81. tmpArtifactId = f'placeholder.{package.artifactId}'
  82. tmpVersion = package.version
  83. if (groupId := root_copy.find('groupId', ns)) is not None:
  84. groupId.text = tmpGroupId
  85. else:
  86. logger.info(f"{package}: Inserting new groupId tag in pom")
  87. ET.SubElement(root_copy, 'groupId').text = tmpGroupId
  88. if (artifactId := root_copy.find('artifactId', ns)) is not None:
  89. artifactId.text = tmpArtifactId
  90. else:
  91. logger.info(f"{package}: Inserting new artifactId tag in pom")
  92. ET.SubElement(root_copy, 'artifactId').text = tmpArtifactId
  93. if (version := root_copy.find('version', ns)) is not None:
  94. version.text = tmpVersion
  95. else:
  96. logger.info(f"{package}: Inserting new version tag in pom")
  97. ET.SubElement(root_copy, 'version').text = tmpVersion
  98. # Add a dependency for the pom itself
  99. dependencies = root_copy.find('dependencies', ns) or ET.SubElement(root_copy, 'dependencies')
  100. self_dep = ET.SubElement(dependencies, 'dependency')
  101. ET.SubElement(self_dep, 'groupId').text = package.groupId
  102. ET.SubElement(self_dep, 'artifactId').text = package.artifactId
  103. ET.SubElement(self_dep, 'version').text = package.version
  104. self.generated_root = root_copy
  105. else:
  106. self.generated_root = ET.fromstring(
  107. f"""
  108. <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
  109. https://maven.apache.org/xsd/maven-4.0.0.xsd"
  110. xmlns="http://maven.apache.org/POM/4.0.0"
  111. xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  112. <modelVersion>4.0.0</modelVersion>
  113. <groupId>tmp.{package.groupId}</groupId>
  114. <artifactId>placeholder-{package.artifactId}</artifactId>
  115. <version>{package.version}</version>
  116. <name>Package {package.artifactId}</name>
  117. <dependencies>
  118. <dependency>
  119. <groupId>{package.groupId}</groupId>
  120. <artifactId>{package.artifactId}</artifactId>
  121. <version>{package.version}</version>
  122. </dependency>
  123. </dependencies>
  124. </project>
  125. """
  126. )
  127. logger.debug(f'{package}: POM parsed')
  128. def write(self, f):
  129. tree = ET.ElementTree(self.generated_root)
  130. ET.indent(tree)
  131. tree.write(f)
  132. def resolve_props(self, initial: dict[str, str]):
  133. props = initial
  134. for prop_tag in self.raw_root.findall('.//properties/*', ns):
  135. prop = prop_tag.tag.replace(f'{{{ns[""]}}}', '')
  136. value = prop_tag.text if prop_tag.text is not None else ''
  137. logger.debug(f'{self._package}: Setting prop {prop}={value}')
  138. props[prop] = value
  139. changed = True
  140. while changed:
  141. changed = False
  142. for prop, value in props.items():
  143. new_value = self.prop_replace(value, props)
  144. if new_value != value:
  145. changed = True
  146. logger.debug(f'{self._package}: Setting prop {prop}={new_value}')
  147. props[prop] = new_value
  148. return props
  149. def prop_replace(self, text, props: dict[str, str] | None = None) -> str:
  150. def lookup_prop(match) -> str:
  151. prop = match.group(1)
  152. if prop == 'project.groupId':
  153. value = str(self._package.groupId)
  154. elif prop == 'project.artifactId':
  155. value = str(self._package.artifactId)
  156. elif prop == 'project.version':
  157. value = str(self._package.version)
  158. elif prop.startswith('project.build') or prop.startswith('env.') or prop.startswith('maven.'):
  159. value = ''
  160. else:
  161. try:
  162. value = props[prop] if props is not None else self.properties[prop]
  163. except KeyError:
  164. logger.error(f'{self._package}: Could not find property {prop}. Setting it to ""')
  165. value = ''
  166. logger.debug(f'{self._package}: Replacing property {prop} with {value}')
  167. return value
  168. return re.sub(
  169. r'\$\{([^\}]*)\}',
  170. lookup_prop,
  171. text,
  172. )
  173. def _package_from_xml_dep(self, dep: ET.Element) -> 'Package':
  174. def prop_replace_tag(tag) -> str:
  175. return self.prop_replace(
  176. elem.text or '' if (elem := dep.find(tag, ns)) is not None else '',
  177. )
  178. return Package(
  179. groupId=prop_replace_tag('groupId'),
  180. artifactId=prop_replace_tag('artifactId'),
  181. version=prop_replace_tag('version'),
  182. )
  183. @property
  184. def dependency_management(self) -> list['Package']:
  185. dependencies: list[Package] = []
  186. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  187. package = self._package_from_xml_dep(dep)
  188. dependencies.append(package)
  189. return dependencies
  190. class Package:
  191. _pom: PackagePOM | None = None
  192. _verified: bool = False
  193. def __init__(self, groupId: str, artifactId: str, version: str | None = None, implicit: bool = False):
  194. self.groupId = groupId
  195. self.artifactId = artifactId
  196. self.version = version
  197. self.implicit = implicit
  198. def __str__(self) -> str:
  199. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  200. def __eq__(self, other) -> bool:
  201. return (
  202. self.groupId == other.groupId
  203. and self.artifactId == other.artifactId
  204. and self.version == other.version
  205. )
  206. def __hash__(self) -> int:
  207. return hash((self.groupId, self.artifactId, self.version))
  208. @property
  209. def dir_path(self):
  210. group_path = self.groupId.replace(".", "/")
  211. return f'{group_path}/{self.artifactId}/{self.version}'
  212. @property
  213. def base_filename(self):
  214. return f'{self.artifactId}-{self.version}'
  215. async def download_file(self, extension):
  216. filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
  217. async with aiohttp.ClientSession() as session:
  218. for mirror in mirrors:
  219. pom_url = f'{mirror}/{filepath}'
  220. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  221. async with session.get(pom_url) as response:
  222. if response.status == 200:
  223. logger.debug(f'{self}: {extension} downloaded')
  224. return await response.text()
  225. break
  226. elif response.status == 429:
  227. raise TooManyRequestsException()
  228. else:
  229. logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}')
  230. else:
  231. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  232. return None
  233. @property
  234. async def pom(self) -> PackagePOM:
  235. if self._pom is not None:
  236. return self._pom
  237. if self.version is None:
  238. await self._query_maven()
  239. self._pom = PackagePOM(self, await self.download_file('pom'))
  240. return self._pom
  241. @property
  242. def _urlquery(self) -> str:
  243. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  244. if self.version is not None:
  245. q += f'+AND+v:{self.version}'
  246. return q
  247. async def _query_maven(self) -> None:
  248. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  249. logger.debug(f'{self}: Querying maven at url {url}')
  250. async with aiohttp.ClientSession() as session:
  251. async with session.get(url) as response:
  252. if response.status == 200:
  253. message = await response.json()
  254. num = message['response']['numFound']
  255. if num:
  256. logger.debug(f'{self}: Query successful')
  257. self._verified = True
  258. if self.version is None:
  259. version = message['response']['docs'][0]['latestVersion']
  260. logger.debug(f'{self}: Using newest version {version}')
  261. self.version = version
  262. else:
  263. if self.implicit:
  264. logger.debug(f'{self}: No matching packages found')
  265. else:
  266. logger.warning(f'{self}: No matching packages found')
  267. self._verified = False
  268. elif response.status == 429:
  269. raise TooManyRequestsException()
  270. else:
  271. self._verified = False
  272. logger.warning(f'{self}: HTTP error {response.status} downloading pom')
  273. async def verify(self) -> bool:
  274. if not self._verified:
  275. await self._query_maven()
  276. return self._verified
  277. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  278. logger.info(f'Parsing {list_path}')
  279. with list_path.open('r') as f:
  280. for line in f.readlines():
  281. sections = line.strip().split(':')
  282. if len(sections) < 2 or len(sections) > 3:
  283. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  284. continue
  285. package = Package(
  286. sections[0],
  287. sections[1],
  288. sections[2] if len(sections) == 3 else None,
  289. )
  290. queue.put_nowait(package)
  291. if not package.artifactId.endswith('-jvm'):
  292. queue.put_nowait(
  293. Package(
  294. package.groupId,
  295. f'{package.artifactId}-jvm',
  296. package.version,
  297. True,
  298. )
  299. )
  300. async def download(package: Package, queue: asyncio.Queue) -> None:
  301. async with done_lock:
  302. skip = str(package) in done
  303. async with in_progress_lock:
  304. skip = skip or (str(package) in in_progress)
  305. if skip:
  306. logger.info(f'{package}: Already downloaded. Skipping.')
  307. else:
  308. async with in_progress_lock:
  309. in_progress.add(str(package))
  310. if await package.verify():
  311. pom_dir = base_pom_path / f'{package.groupId}-{package.artifactId}-{package.version}'
  312. pom_path = pom_dir / 'pom.xml'
  313. pom_dir.mkdir(exist_ok=True)
  314. pom = await package.pom
  315. if not pom:
  316. return
  317. pom.write(pom_path)
  318. logger.info(f'{package}: Downloaded')
  319. if not pom.is_bom:
  320. for dep in pom.dependency_management:
  321. logger.info(f'{package}: Handling transitive dependency {dep}')
  322. await queue.put(dep)
  323. async with done_lock:
  324. logger.debug(f'{package}: Marking done')
  325. p = copy.copy(package)
  326. p.version = None
  327. done.add(str(package))
  328. done.add(str(p))
  329. async with in_progress_lock:
  330. if str(package) in in_progress:
  331. in_progress.remove(str(package))
  332. else:
  333. p = copy.copy(package)
  334. p.version = None
  335. if str(p) in in_progress:
  336. in_progress.remove(str(p))
  337. else:
  338. logger.warning(f'{package}: Package is done, but not marked as in progress')
  339. async def worker(queue: asyncio.Queue) -> None:
  340. while True:
  341. package = await queue.get()
  342. while True:
  343. try:
  344. await download(package, queue)
  345. break
  346. except TooManyRequestsException:
  347. logger.info('Too many requests. Delaying next attempt')
  348. await asyncio.sleep(3*random.random() + 0.2)
  349. except WaitForPackage as e:
  350. logger.info(f'{package}: Waiting for {e.package}')
  351. await queue.put(e.package)
  352. await queue.put(package)
  353. break
  354. except PackageError:
  355. logger.exception(f'{package}: Error while processing package')
  356. break
  357. except Exception:
  358. logger.exception(f'{package}: Unknown error while processing package')
  359. logger.error(global_properties)
  360. break
  361. queue.task_done()
  362. async def main() -> None:
  363. queue: asyncio.Queue = asyncio.Queue()
  364. tasks = []
  365. load_package_list(Path('package-list.txt'), queue)
  366. logger.debug(f'Starting {num_workers} workers')
  367. for i in range(num_workers):
  368. tasks.append(
  369. asyncio.create_task(
  370. worker(queue)
  371. )
  372. )
  373. await queue.join()
  374. logger.debug('Queue is empty. Cancelling workers')
  375. for task in tasks:
  376. task.cancel()
  377. await asyncio.gather(*tasks, return_exceptions=True)
  378. logger.info('Generating master POM')
  379. subprocess.call(['sh', 'generate_master_pom.sh'])
  380. logger = logging.getLogger(__name__)
  381. if __name__ == '__main__':
  382. parser = argparse.ArgumentParser()
  383. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  384. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  385. args = parser.parse_args()
  386. if args.verbosity == 0:
  387. log_level = 'WARNING'
  388. elif args.verbosity == 1:
  389. log_level = 'INFO'
  390. else:
  391. log_level = 'DEBUG'
  392. logging.basicConfig(level=log_level)
  393. num_workers = args.workers
  394. asyncio.run(main())