Você não pode selecionar mais de 25 tópicos. Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

740 linhas
24KB

  1. #!/bin/python3
  2. import re
  3. import argparse
  4. import logging
  5. import asyncio
  6. from typing import Union, Optional
  7. from enum import Enum, auto
  8. from aiohttp import ClientSession
  9. from pathlib import Path
  10. from xml.etree import ElementTree as ET
# Module-wide logger; its level is configured in the __main__ section.
logger = logging.getLogger(__name__)
# Default (un-prefixed) XML namespace of POM documents, passed to ElementTree lookups.
ns = {'': 'http://maven.apache.org/POM/4.0.0'}
# Register the POM namespace under the empty prefix with ElementTree.
ET.register_namespace('', ns[''])
# Root directory handed to Package.download_all().
output_dir = Path('output')
baseurl = 'https://search.maven.org'  # not referenced by the code visible in this file
base_pom_path = Path('poms')  # not referenced by the code visible in this file
# Repositories that are tried, in this order, for metadata and file downloads.
mirrors = [
    "https://repo.maven.apache.org/maven2",
    "https://repo1.maven.org/maven2",
    "https://oss.sonatype.org/content/repositories/snapshots",
    "https://packages.confluent.io/maven",
    "https://registry.quarkus.io/maven",
    "https://plugins.gradle.org/m2",
]
# Value substituted for the ${java.version} POM property.
java_version = 11
# Default number of download workers; overridden by -w/--workers.
num_workers = 50
# Both globals are only declared here; main() assigns them.
queue: 'PackageQueue'
session: ClientSession
  29. class RequiresPackage(Exception):
  30. def __init__(self, package: 'Package'):
  31. super().__init__(f'Requires package {package}')
  32. self.package = package
  33. class PackageState(Enum):
  34. PENDING = auto()
  35. DONE = auto()
  36. class PackageQueue:
  37. def __init__(self):
  38. self.lock = asyncio.Lock()
  39. self.queue = asyncio.Queue()
  40. self.packages: dict[str, tuple['Package', 'PackageState']] = {}
  41. async def put(self, package: 'Package', timeout: int | None = None) -> None:
  42. key = str(package)
  43. async with self.lock:
  44. if key not in self.packages:
  45. logger.debug(f'{key}: Added to list')
  46. self.packages[key] = (package, PackageState.PENDING)
  47. else:
  48. return
  49. await asyncio.wait_for(self.queue.put(package), timeout)
  50. async def requeue(self, package: 'Package', timeout: int | None = None) -> None:
  51. key = str(package)
  52. async with self.lock:
  53. exists = key in self.packages
  54. if exists:
  55. await asyncio.wait_for(self.queue.put(package), timeout)
  56. self.queue.task_done()
  57. async def get(self, timeout: int | None = None) -> 'Package':
  58. while True:
  59. package = await asyncio.wait_for(self.queue.get(), timeout)
  60. key = str(package)
  61. async with self.lock:
  62. if key in self.packages:
  63. state = self.packages[key][1]
  64. match state:
  65. case PackageState.DONE:
  66. logger.debug(f'{package}: Already downloaded. Skipping.')
  67. case PackageState.PENDING:
  68. return package
  69. case _:
  70. logger.warning(f'{package}: Unknown state {state}')
  71. else:
  72. logger.warning(f'{package}: Package is in queue but not in package list')
  73. async def get_done(self, package: Union['Package', str]) -> Optional['Package']:
  74. key = str(package)
  75. async with self.lock:
  76. if key in self.packages:
  77. pack, state = self.packages[key]
  78. if state == PackageState.DONE:
  79. return pack
  80. return None
  81. async def done(self, package: 'Package') -> None:
  82. key = str(package)
  83. async with self.lock:
  84. if key not in self.packages:
  85. logger.warning(f'{package}: Unknown package marked as done')
  86. self.packages[key] = (package, PackageState.DONE)
  87. self.queue.task_done()
  88. async def set_version(self, package):
  89. key = str(package)
  90. if key.count(':') == 2:
  91. a, b, c = key.split(':')
  92. x = ':'.join([a, b, '----'])
  93. else:
  94. raise RuntimeError(f'Malformed package specifier {package}')
  95. async def join(self) -> None:
  96. await self.queue.join()
  97. def empty(self) -> bool:
  98. return self.queue.empty()
  99. class MavenVersion:
  100. qualifiers = (
  101. ('alpha', 'a'),
  102. ('beta', 'b'),
  103. ('milestone', 'mc'),
  104. ('rc'),
  105. ('sp'),
  106. ('ga'),
  107. ('final'),
  108. )
  109. def __init__(self, version: str):
  110. self.raw = version
  111. self.parts = [x for x in re.split(r'[\.-]|((?<=\d)(?=\w))', version) if x]
  112. def __str__(self) -> str:
  113. return self.raw
  114. def _compare(self, other):
  115. match other:
  116. case MavenVersion():
  117. pass
  118. case str():
  119. other = MavenVersion(other)
  120. case _:
  121. return False
  122. for x, y in zip(self.parts, other.parts):
  123. if x == y:
  124. return 0
  125. elif x.isnumeric():
  126. if y.isnumeric():
  127. if int(x) > int(y):
  128. return 1
  129. elif int(x) < int(y):
  130. return -1
  131. else:
  132. return 1
  133. elif y.isnumeric():
  134. return -1
  135. else:
  136. def qualifier_index(qualifier: str) -> int | None:
  137. for i, q in enumerate(self.qualifiers):
  138. for alias in q:
  139. if alias == qualifier:
  140. return i
  141. return None
  142. xi = qualifier_index(x)
  143. yi = qualifier_index(y)
  144. if xi is not None and yi is not None:
  145. if xi < yi:
  146. return 1
  147. elif xi > yi:
  148. return -1
  149. raise RuntimeError(f"Can't compare qualifier {x} and {y} from version {self} and {other}")
  150. return 0
  151. def __eq__(self, other) -> bool:
  152. try:
  153. return self._compare(other) == 0
  154. except RuntimeError:
  155. return False
  156. def __ne__(self, other) -> bool:
  157. try:
  158. return self._compare(other) != 0
  159. except RuntimeError:
  160. return True
  161. def __gt__(self, other) -> bool:
  162. return self._compare(other) == 1
  163. def __ge__(self, other) -> bool:
  164. return self._compare(other) in [0, 1]
  165. def __lt__(self, other) -> bool:
  166. return self._compare(other) == -1
  167. def __le__(self, other) -> bool:
  168. return self._compare(other) in [-1, 0]
  169. class MavenVersionRange:
  170. def __init__(self, range_str: str):
  171. self.range_str = range_str
  172. def is_version_in_range(self, version: str) -> bool:
  173. is_match = False
  174. for _, low_bracket, content, high_bracket in re.findall(r'((\(|\[)([^\(\[\]\)]+)(\]|\)))', self.range_str):
  175. bounds = content.split(',')
  176. if len(bounds) == 2:
  177. low_bound, high_bound = [
  178. MavenVersion(bound) if bound else None
  179. for bound in bounds
  180. ]
  181. elif len(bounds) == 1:
  182. low_bound = high_bound = MavenVersion(bounds[0]) if bounds[0] else None
  183. else:
  184. raise RuntimeError(f'Invalid version range {self.range_str}')
  185. match low_bracket:
  186. case '(':
  187. if low_bound is not None:
  188. if low_bound >= self:
  189. continue
  190. case '[':
  191. if low_bound is not None:
  192. if low_bound > self:
  193. continue
  194. else:
  195. raise RuntimeError(f"Error in version range {self.range_str}. [ can't be unbounded")
  196. match high_bracket:
  197. case ')':
  198. if high_bound is not None:
  199. if high_bound <= self:
  200. continue
  201. case ']':
  202. if high_bound is not None:
  203. if high_bound > self:
  204. continue
  205. else:
  206. raise RuntimeError(f"Error in version range {self.range_str}. ] can't be unbounded")
  207. is_match = True
  208. return is_match
  209. def get_maven_version_or_range(version_or_range: str) -> MavenVersion | MavenVersionRange:
  210. if re.fullmatch(r'([^\(\[\]\)]+)', version_or_range):
  211. return MavenVersion(version_or_range)
  212. else:
  213. return MavenVersionRange(version_or_range)
  214. class PackagePOM:
  215. def __init__(self, package: 'Package', pom: str):
  216. logger.debug(f'{package}: Parsing POM')
  217. self.package = package
  218. self.raw_root = ET.fromstring(pom)
  219. async def parse_pom(self):
  220. parent_tag = self.raw_root.find('parent', ns)
  221. parent = await self._package_from_xml(parent_tag) if parent_tag else None
  222. if parent is not None:
  223. if (parent := await queue.get_done(parent)) is not None:
  224. logger.debug(f'{self.package}: Using parent {parent}')
  225. self.parent = parent
  226. else:
  227. logger.debug(f'{self.package}: Requires parent {parent}')
  228. raise RequiresPackage(parent)
  229. else:
  230. self.parent = None
  231. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  232. self.packaging = await self._format_with_props(packaging.text)
  233. else:
  234. self.packaging = 'jar'
  235. self.is_bom = self.packaging == 'pom'
  236. logger.debug(f'{self.package}: POM parsed')
  237. async def get_property(self, prop: str) -> str | None:
  238. match prop:
  239. case 'project.groupId' | 'pom.groupId':
  240. return self.package.groupId
  241. case 'project.artifactId' | 'pom.artifactId':
  242. return self.package.artifactId
  243. case 'project.version' | 'pom.version' | 'version':
  244. return str(self.package.version)
  245. case 'java.version':
  246. return str(java_version)
  247. case _:
  248. elem = self.raw_root.find(f'.//properties/{prop}', ns)
  249. if elem is not None:
  250. return await self._format_with_props(elem.text)
  251. elif self.parent:
  252. parent_pom = self.parent.pom
  253. if parent_pom is not None:
  254. parent_prop = prop.replace('parent.', '')
  255. if prop == parent_prop:
  256. logger.debug(f'{self.package}: Passing property {prop} to parent as {parent_prop}')
  257. else:
  258. logger.debug(f'{self.package}: Passing property {prop} to parent')
  259. return await parent_pom.get_property(parent_prop)
  260. else:
  261. logger.warning(f'{self.package}: Parent {self.parent} does not have a pom file')
  262. import pdb; pdb.set_trace()
  263. return None
  264. else:
  265. return None
  266. async def _format_with_props(self, text):
  267. arr = re.split(r'\$\{([^\}]*)\}', text)
  268. for i in range(1, len(arr), 2):
  269. prop = arr[i]
  270. value = await self.get_property(prop)
  271. if value is None:
  272. logger.warning(f'{self.package}: Property {prop} not found. Defaulting to an empty string')
  273. value = ''
  274. arr[i] = value
  275. logger.debug(f'{self.package}: Replacing property "{prop}" with "{value}"')
  276. return ''.join(arr)
  277. async def _package_from_xml(self, dep: ET.Element):
  278. package = Package(
  279. *[
  280. await self._format_with_props(
  281. elem.text or '' if (elem := dep.find(tag, ns)) is not None else ''
  282. )
  283. for tag in [
  284. 'groupId',
  285. 'artifactId',
  286. 'version',
  287. ]
  288. ]
  289. )
  290. return package
  291. @property
  292. async def dependency_management(self) -> list['Package']:
  293. dependencies: list[Package] = []
  294. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  295. package = await self._package_from_xml(dep)
  296. logger.debug(f'{self.package}: Adding dependency management {package}')
  297. dependencies.append(package)
  298. return dependencies
  299. @property
  300. async def dependencies(self) -> list['Package']:
  301. dependencies: list[Package] = []
  302. for dep in self.raw_root.find('dependencies', ns) or []:
  303. package = await self._package_from_xml(dep)
  304. logger.debug(f'{self.package}: Adding dependency {package}')
  305. dependencies.append(package)
  306. return dependencies
  307. class Package:
  308. _verified: bool = False
  309. pom: PackagePOM | None = None
  310. groupId: str
  311. artifactId: str
  312. version: MavenVersion | None
  313. version_range: MavenVersionRange | None
  314. def __init__(self, groupId: str, artifactId: str, version: str = ''):
  315. self.groupId = groupId.strip()
  316. self.artifactId = artifactId.strip()
  317. match (val := get_maven_version_or_range(version.strip())):
  318. case MavenVersion():
  319. self.version = val
  320. self.version_range = None
  321. case MavenVersionRange():
  322. self.version = None
  323. self.version_range = val
  324. if not self.groupId or not self.artifactId:
  325. logger.warning(f'{self}: groupId or artifactId is empty')
  326. def __str__(self) -> str:
  327. return f'{self.groupId or "----"}:{self.artifactId or "----"}:{self.version or "----"}'
  328. def __eq__(self, other) -> bool:
  329. return (
  330. self.groupId == other.groupId
  331. and self.artifactId == other.artifactId
  332. and self.version == other.version
  333. )
  334. def __hash__(self) -> int:
  335. return hash((self.groupId, self.artifactId, self.version))
  336. @property
  337. def package_dir_path(self):
  338. group_path = self.groupId.replace(".", "/")
  339. return f'{group_path}/{self.artifactId}'
  340. @property
  341. def version_dir_path(self):
  342. return self.package_dir_path + f'/{self.version}'
  343. @property
  344. def base_filename(self):
  345. return f'{self.artifactId}-{self.version}'
  346. async def download_file_bytes(self, extension) -> bytes | None:
  347. filepath = f'{self.version_dir_path}/{self.base_filename}.{extension}'
  348. for mirror in mirrors:
  349. pom_url = f'{mirror}/{filepath}'
  350. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  351. async with session.get(pom_url) as response:
  352. if response.status == 200:
  353. logger.debug(f'{self}: {extension} downloaded')
  354. return await response.read()
  355. else:
  356. logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}')
  357. else:
  358. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  359. return None
  360. async def download_file_text(self, extension: str) -> str | None:
  361. b = await self.download_file_bytes(extension)
  362. if b is not None:
  363. return b.decode('utf-8')
  364. else:
  365. return None
  366. async def download_all(self, out_dir: Path) -> bool:
  367. basedir = out_dir / self.version_dir_path
  368. basedir.mkdir(exist_ok=True, parents=True)
  369. basepath = basedir / self.base_filename
  370. pom = self.pom
  371. if not pom:
  372. return False
  373. match pom.packaging:
  374. case 'pom':
  375. return True
  376. case 'jar' | 'maven-plugin' | 'eclipse-plugin' | 'bundle':
  377. logger.debug(f'{self}: Downloading JAR')
  378. return True # TODO: Remove to test JAR download
  379. # TODO: Handle checksums
  380. jar = await self.download_file_bytes('jar')
  381. if jar:
  382. with basepath.with_suffix('.jar').open('wb') as f:
  383. f.write(jar)
  384. return True
  385. else:
  386. logger.warning(f'{self}: JAR not found')
  387. return False
  388. case _:
  389. logger.warning(f'{self}: Unknown packaging {pom.packaging}')
  390. return False
  391. return False
  392. async def fetch_pom(self) -> None:
  393. if self.pom is not None:
  394. await self.pom.parse_pom()
  395. return
  396. if self.version is None:
  397. await self._query_maven()
  398. xml = await self.download_file_text('pom')
  399. if xml is not None:
  400. self.pom = PackagePOM(self, xml)
  401. await self.pom.parse_pom()
  402. @property
  403. def _urlquery(self) -> str:
  404. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  405. if self.version is not None:
  406. q += f'+AND+v:{self.version}'
  407. return q
  408. async def _query_maven(self) -> None:
  409. for mirror in mirrors:
  410. url = f'{mirror}/{self.package_dir_path}/maven-metadata.xml'
  411. logger.debug(f'{self}: Querying maven metadata at url {url}')
  412. async with session.get(url) as response:
  413. if response.status == 200:
  414. message = await response.text()
  415. xml = ET.fromstring(message)
  416. version_tags = xml.findall('./versioning/versions/version')
  417. versions = [MavenVersion(version.text) for version in version_tags if version.text is not None]
  418. logger.debug(f'{self}: Query successful')
  419. if not self.version:
  420. elem = xml.find('./versioning/latest')
  421. if elem is not None and (text := elem.text):
  422. self._verified = True
  423. self.version = MavenVersion(text)
  424. logger.debug(f'{self}: Using newest version {self.version}')
  425. break
  426. else:
  427. logger.debug(f'{self}: No latest version marked in metadata')
  428. if not versions:
  429. logger.warning(f'{self}: No versions available in metadata')
  430. else:
  431. self._verified = True
  432. self.version = max(versions)
  433. logger.debug(f'{self}: Using version {self.version}')
  434. break
  435. elif self.version is not None:
  436. if self.version in versions:
  437. self._verified = True
  438. break
  439. elif self.version_range is not None:
  440. valid_versions = [v for v in versions if self.version_range.matches(v)]
  441. if valid_versions:
  442. self.version = max(valid_versions)
  443. self._verified = True
  444. logger.debug(f'{self}: Picked version {self.version} from range {self.version_range}')
  445. break
  446. else:
  447. logger.debug(f"{self}: Available versions '{versions}' doesn't match range {self.version_range}")
  448. continue
  449. else:
  450. raise RuntimeError(f'{self}: Unknown version type {self.version}')
  451. else:
  452. self._verified = False
  453. logger.debug(f'{self}: Package not found in mirror')
  454. else:
  455. logger.warning(f'{self}: Package not found')
  456. async def verify(self) -> bool:
  457. if not self._verified:
  458. await self._query_maven()
  459. return self._verified
  460. async def load_package_list(list_path: Path) -> None:
  461. logger.info(f'Parsing {list_path}')
  462. with list_path.open('r') as f:
  463. for line in f.readlines():
  464. sections = line.strip().split(':')
  465. if len(sections) < 2 or len(sections) > 3:
  466. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  467. continue
  468. package = Package(
  469. sections[0],
  470. sections[1],
  471. sections[2] if len(sections) == 3 else '',
  472. )
  473. await package.verify()
  474. await queue.put(package, timeout=10)
  475. async def download(package: Package) -> None:
  476. if await package.verify():
  477. await package.fetch_pom()
  478. success = await package.download_all(output_dir)
  479. if success:
  480. logger.info(f'{package}: Downloaded')
  481. else:
  482. logger.warning(f'{package}: Download failed')
  483. return
  484. pom = package.pom
  485. if pom is not None:
  486. deps = await pom.dependencies
  487. depman = await pom.dependency_management
  488. for dep in [*deps, *depman]:
  489. await queue.put(dep)
  490. else:
  491. logger.warning(f'{package}: POM not found. Skipping')
  492. else:
  493. logger.warning(f"{package}: Can't verify package")
  494. async def file_parse_worker(id: str) -> None:
  495. while True:
  496. try:
  497. package = await queue.get(timeout=5)
  498. except asyncio.TimeoutError:
  499. logger.error(f'Worker {id} timed out waiting for queue. Stopping worker')
  500. break
  501. try:
  502. await asyncio.wait_for(package.fetch_pom(), timeout=60)
  503. except asyncio.TimeoutError as e:
  504. logger.error(f'{package}: Timeout out waiting for download', exc_info=e)
  505. except RequiresPackage as e:
  506. await queue.put(e.package)
  507. await queue.requeue(package)
  508. except Exception as e:
  509. logger.error(f'{package}:', exc_info=e)
  510. finally:
  511. await queue.done(package)
  512. async def download_worker(id: str) -> None:
  513. while True:
  514. try:
  515. package = await queue.get(timeout=5)
  516. except asyncio.TimeoutError:
  517. if queue.empty():
  518. logger.error(f'Worker {id} timed out waiting for empty queue. Stopping worker')
  519. else:
  520. logger.error(f'Worker {id} timed out waiting for queue lock. Stopping worker')
  521. break
  522. try:
  523. if package:
  524. await asyncio.wait_for(download(package), timeout=60)
  525. except asyncio.TimeoutError as e:
  526. logger.error(f'{package}: Timeout out waiting for download', exc_info=e)
  527. except RequiresPackage as e:
  528. await queue.put(e.package)
  529. await queue.requeue(package)
  530. except Exception as e:
  531. logger.error(f'{package}:', exc_info=e)
  532. finally:
  533. await queue.done(package)
async def parse_list_tasks() -> None:
    """Load ``package-list.txt`` into the queue.

    The worker-based variant is kept below as disabled code; only
    load_package_list() runs.
    """
    #tasks = []
    #logger.debug(f'Starting {num_workers} download workers')
    #for i in range(num_workers):
    # tasks.append(
    # asyncio.create_task(
    # file_parse_worker(str(i))
    # )
    # )
    await load_package_list(Path('package-list.txt'))
    #exceptions = await asyncio.gather(*tasks, return_exceptions=True)
    #for e in exceptions:
    #logger.debug('Worker exception', exc_info=e)
  547. async def download_tasks() -> None:
  548. tasks = []
  549. logger.debug(f'Starting {num_workers} download workers')
  550. for i in range(num_workers):
  551. tasks.append(
  552. asyncio.create_task(
  553. download_worker(str(i))
  554. )
  555. )
  556. try:
  557. await queue.join()
  558. logger.debug('Queue is empty. Cancelling workers')
  559. except asyncio.CancelledError:
  560. logger.info('Tasks cancelled')
  561. for task in tasks:
  562. task.cancel()
  563. exceptions = await asyncio.gather(*tasks, return_exceptions=True)
  564. for e in exceptions:
  565. logger.debug(f'Worker exception {e}', exc_info=e)
  566. async def main() -> None:
  567. global queue
  568. global session
  569. queue = PackageQueue()
  570. session = ClientSession()
  571. try:
  572. await parse_list_tasks()
  573. await download_tasks()
  574. finally:
  575. await session.close()
  576. if __name__ == '__main__':
  577. parser = argparse.ArgumentParser()
  578. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  579. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  580. args = parser.parse_args()
  581. if args.verbosity == 0:
  582. log_level = 'WARNING'
  583. elif args.verbosity == 1:
  584. log_level = 'INFO'
  585. else:
  586. log_level = 'DEBUG'
  587. logging.basicConfig(level=log_level)
  588. num_workers = args.workers
  589. asyncio.run(main())