25개 이상의 토픽을 선택하실 수 없습니다. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

329 lines
9.9KB

  1. #!/bin/python3
  2. import re
  3. import random
  4. import argparse
  5. import logging
  6. import asyncio
  7. import subprocess
  8. import copy
  9. import aiohttp
  10. from pathlib import Path
  11. from xml.etree import ElementTree as ET
  12. ns = {'': 'http://maven.apache.org/POM/4.0.0'}
  13. ET.register_namespace('', ns[''])
  14. baseurl = 'https://search.maven.org'
  15. output_path: Path = Path()
  16. mirrors = [
  17. "https://repo.maven.apache.org/maven2",
  18. "https://repo1.maven.org/maven2",
  19. "https://oss.sonatype.org/content/repositories/snapshots",
  20. "https://packages.confluent.io/maven",
  21. "https://registry.quarkus.io/maven",
  22. "https://plugins.gradle.org/m2",
  23. ]
  24. done: set[str] = set()
  25. done_lock = asyncio.Lock()
  26. num_workers = 50
  27. class PackagePOM:
  28. def __init__(self, package: 'Package', pom: str):
  29. logger.debug(f'{package}: Parsing POM')
  30. self._package = package
  31. self.raw_root = ET.fromstring(pom)
  32. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  33. self.packaging = packaging.text
  34. else:
  35. self.packaging = '??'
  36. self.is_bom = self.packaging == 'pom'
  37. if self.packaging == 'pom':
  38. self.packages = [package, *self.dependency_management]
  39. else:
  40. self.packages = [package]
  41. logger.debug(f'{package}: POM parsed')
  42. def get_property(self, prop: str):
  43. elem = self.raw_root.find(f'.//properties/{prop}', ns)
  44. if elem is not None:
  45. return elem.text
  46. else:
  47. return None
  48. def _package_from_xml_dep(self, dep: ET.Element) -> 'Package':
  49. def lookup_prop(match) -> str:
  50. prop = match.group(1)
  51. if prop == 'project.groupId':
  52. value = str(self._package.groupId)
  53. elif prop == 'project.artifactId':
  54. value = str(self._package.artifactId)
  55. elif prop == 'project.version':
  56. value = str(self._package.version)
  57. else:
  58. value = prop_replace(self.get_property(prop))
  59. logger.debug(f'{self._package}: Replacing property {prop} with {value}')
  60. return value
  61. def prop_replace(text) -> str:
  62. return re.sub(
  63. r'\$\{([^\}]*)\}',
  64. lookup_prop,
  65. text,
  66. )
  67. return Package(
  68. *[
  69. prop_replace(
  70. elem.text or '' if (elem := dep.find(tag, ns)) is not None else '',
  71. )
  72. for tag in [
  73. 'groupId',
  74. 'artifactId',
  75. 'version',
  76. ]
  77. ]
  78. )
  79. @property
  80. def dependency_management(self) -> list['Package']:
  81. dependencies: list[Package] = []
  82. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  83. package = self._package_from_xml_dep(dep)
  84. dependencies.append(package)
  85. return dependencies
  86. class Package:
  87. _pom: PackagePOM | None = None
  88. _verified: bool = False
  89. def __init__(self, groupId: str, artifactId: str, version: str | None = None, implicit: bool = False):
  90. self.groupId = groupId
  91. self.artifactId = artifactId
  92. self.version = version
  93. self.implicit = implicit
  94. def __str__(self) -> str:
  95. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  96. def __eq__(self, other) -> bool:
  97. return (
  98. self.groupId == other.groupId
  99. and self.artifactId == other.artifactId
  100. and self.version == other.version
  101. )
  102. def __hash__(self) -> int:
  103. return hash((self.groupId, self.artifactId, self.version))
  104. @property
  105. def dir_path(self):
  106. group_path = self.groupId.replace(".", "/")
  107. return f'{group_path}/{self.artifactId}/{self.version}'
  108. @property
  109. def base_filename(self):
  110. return f'{self.artifactId}-{self.version}'
  111. async def download_file(self, extension):
  112. filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
  113. async with aiohttp.ClientSession() as session:
  114. for mirror in mirrors:
  115. pom_url = f'{mirror}/{filepath}'
  116. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  117. async with session.get(pom_url) as response:
  118. if response.status == 200:
  119. logger.debug(f'{self}: {extension} downloaded')
  120. return await response.text()
  121. break
  122. elif response.status == 429:
  123. logger.error(f'{self}: HTTP error 429 (Too many requests). Retry after {response.headers["Retry-After"]}')
  124. else:
  125. logger.error(f'{self}: HTTP error {response.status} from mirror {mirror}')
  126. else:
  127. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  128. return None
  129. @property
  130. async def pom(self) -> PackagePOM:
  131. if self._pom is not None:
  132. return self._pom
  133. if self.version is None:
  134. await self._query_maven()
  135. self._pom = PackagePOM(self, await self.download_file('pom'))
  136. return self._pom
  137. @property
  138. def _urlquery(self) -> str:
  139. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  140. if self.version is not None:
  141. q += f'+AND+v:{self.version}'
  142. return q
  143. async def _query_maven(self) -> None:
  144. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  145. logger.debug(f'{self}: Querying maven at url {url}')
  146. async with aiohttp.ClientSession() as session:
  147. async with session.get(url) as response:
  148. if response.status == 200:
  149. message = await response.json()
  150. num = message['response']['numFound']
  151. if num:
  152. logger.debug(f'{self}: Query successful')
  153. self._verified = True
  154. if self.version is None:
  155. version = message['response']['docs'][0]['latestVersion']
  156. self.version = version
  157. logger.debug(f'{self}: Using newest version {version}')
  158. else:
  159. if self.implicit:
  160. logger.debug(f'{self}: No matching packages found')
  161. else:
  162. logger.warning(f'{self}: No matching packages found')
  163. self._verified = False
  164. else:
  165. self._verified = False
  166. logger.error(f'{self}: HTTP error {response.status} downloading pom')
  167. async def verify(self) -> bool:
  168. if not self._verified:
  169. await self._query_maven()
  170. return self._verified
  171. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  172. logger.info(f'Parsing {list_path}')
  173. with list_path.open('r') as f:
  174. for line in f.readlines():
  175. sections = line.strip().split(':')
  176. if len(sections) < 2 or len(sections) > 3:
  177. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  178. continue
  179. package = Package(
  180. sections[0],
  181. sections[1],
  182. sections[2] if len(sections) == 3 else None,
  183. )
  184. queue.put_nowait(package)
  185. if not package.artifactId.endswith('-jvm'):
  186. queue.put_nowait(
  187. Package(
  188. package.groupId,
  189. f'{package.artifactId}-jvm',
  190. package.version,
  191. True,
  192. )
  193. )
  194. async def download(package: Package, queue: asyncio.Queue) -> None:
  195. async with done_lock:
  196. skip = str(package) in done
  197. if skip:
  198. logger.info(f'{package}: Already downloaded. Skipping.')
  199. elif await package.verify():
  200. pom = await package.pom
  201. if pom:
  202. logger.info(f'{package}: Done')
  203. async with done_lock:
  204. for p in pom.packages:
  205. if not p.version:
  206. logger.warning(f'{p}: No version found!')
  207. logger.debug(f'{p}: Adding from BOM')
  208. done.add(str(p))
  209. else:
  210. logger.warning(f'{package}: No POM for package')
  211. async def worker(queue: asyncio.Queue) -> None:
  212. while True:
  213. package = await queue.get()
  214. await download(package, queue)
  215. await asyncio.sleep(random.random())
  216. queue.task_done()
  217. async def main() -> None:
  218. queue: asyncio.Queue = asyncio.Queue()
  219. tasks = []
  220. load_package_list(Path('package-list.txt'), queue)
  221. logger.debug(f'Starting {num_workers} workers')
  222. for i in range(num_workers):
  223. tasks.append(
  224. asyncio.create_task(
  225. worker(queue)
  226. )
  227. )
  228. await queue.join()
  229. logger.debug('Queue is empty. Cancelling workers')
  230. for task in tasks:
  231. task.cancel()
  232. await asyncio.gather(*tasks, return_exceptions=True)
  233. logger.info('Generating list of all packages')
  234. async with done_lock:
  235. with open(output_path, 'w') as f:
  236. for p in done:
  237. f.write(p + '\n')
  238. logger = logging.getLogger(__name__)
  239. if __name__ == '__main__':
  240. parser = argparse.ArgumentParser()
  241. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  242. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  243. parser.add_argument('-o', '--output', type=Path, default=Path('full-package-list.txt'))
  244. args = parser.parse_args()
  245. if args.verbosity == 0:
  246. log_level = 'WARNING'
  247. elif args.verbosity == 1:
  248. log_level = 'INFO'
  249. else:
  250. log_level = 'DEBUG'
  251. logging.basicConfig(level=log_level)
  252. num_workers = args.workers
  253. output_path = args.output
  254. asyncio.run(main())