Ви не можете вибрати більше 25 тем. Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

330 рядки
10KB

  1. #!/bin/python3
  2. import re
  3. import argparse
  4. import logging
  5. import asyncio
  6. import subprocess
  7. import copy
  8. import aiohttp
  9. from pathlib import Path
  10. from xml.etree import ElementTree as ET
  11. ns = {'': 'http://maven.apache.org/POM/4.0.0'}
  12. ET.register_namespace('', ns[''])
  13. baseurl = 'https://search.maven.org'
  14. base_pom_path = Path('poms')
  15. mirrors = [
  16. "https://repo.maven.apache.org/maven2",
  17. "https://repo1.maven.org/maven2",
  18. "https://oss.sonatype.org/content/repositories/snapshots",
  19. "https://packages.confluent.io/maven",
  20. "https://registry.quarkus.io/maven",
  21. "https://plugins.gradle.org/m2",
  22. ]
  23. done: set[str] = set()
  24. done_lock = asyncio.Lock()
  25. num_workers = 50
  26. class PackagePOM:
  27. def __init__(self, package: 'Package', pom: str):
  28. logger.debug(f'{package}: Parsing POM')
  29. self.raw_root = ET.fromstring(pom)
  30. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  31. self.packaging = packaging.text
  32. else:
  33. self.packaging = '??'
  34. self.is_bom = self.packaging == 'pom'
  35. if self.packaging == 'pom':
  36. root_copy = copy.deepcopy(self.raw_root)
  37. depman = root_copy.find('dependencyManagement', ns)
  38. if depman is not None:
  39. root_copy.extend(depman.findall('*'))
  40. root_copy.remove(depman)
  41. self.generated_root = root_copy
  42. else:
  43. self.generated_root = ET.fromstring(
  44. f"""
  45. <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
  46. https://maven.apache.org/xsd/maven-4.0.0.xsd"
  47. xmlns="http://maven.apache.org/POM/4.0.0"
  48. xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  49. <modelVersion>4.0.0</modelVersion>
  50. <groupId>tmp.{package.groupId}</groupId>
  51. <artifactId>placeholder-{package.artifactId}</artifactId>
  52. <version>{package.version}</version>
  53. <name>Package {package.artifactId}</name>
  54. <dependencies>
  55. <dependency>
  56. <groupId>{package.groupId}</groupId>
  57. <artifactId>{package.artifactId}</artifactId>
  58. <version>{package.version}</version>
  59. </dependency>
  60. </dependencies>
  61. </project>
  62. """
  63. )
  64. logger.debug(f'{package}: POM parsed')
  65. def write(self, f):
  66. tree = ET.ElementTree(self.generated_root)
  67. ET.indent(tree)
  68. tree.write(f)
  69. def get_property(self, prop: str):
  70. elem = self.raw_root.find(f'.//properties/{prop}', ns)
  71. if elem is not None:
  72. return elem.text
  73. else:
  74. return None
  75. def _package_from_xml_dep(self, dep: ET.Element):
  76. def prop_replace(match):
  77. prop = match.group(1)
  78. value = self.get_property(match.group(1))
  79. logger.debug(f'Replacing property {prop} with {value}')
  80. return value
  81. return Package(
  82. *[
  83. re.sub(
  84. r'\$\{([^\}]*)\}',
  85. prop_replace,
  86. elem.text or '' if (elem := dep.find(tag, ns)) is not None else '',
  87. )
  88. for tag in [
  89. 'groupId',
  90. 'artifactId',
  91. 'version',
  92. ]
  93. ]
  94. )
  95. @property
  96. def dependency_management(self) -> list['Package']:
  97. dependencies: list[Package] = []
  98. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  99. package = self._package_from_xml_dep(dep)
  100. dependencies.append(package)
  101. return dependencies
  102. class Package:
  103. _pom: PackagePOM | None = None
  104. _verified: bool = False
  105. def __init__(self, groupId: str, artifactId: str, version: str = None):
  106. self.groupId = groupId
  107. self.artifactId = artifactId
  108. self.version = version
  109. def __str__(self) -> str:
  110. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  111. def __eq__(self, other) -> bool:
  112. return (
  113. self.groupId == other.groupId
  114. and self.artifactId == other.artifactId
  115. and self.version == other.version
  116. )
  117. def __hash__(self) -> int:
  118. return hash((self.groupId, self.artifactId, self.version))
  119. @property
  120. def dir_path(self):
  121. group_path = self.groupId.replace(".", "/")
  122. return f'{group_path}/{self.artifactId}/{self.version}'
  123. @property
  124. def base_filename(self):
  125. return f'{self.artifactId}-{self.version}'
  126. async def download_file(self, extension):
  127. filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
  128. async with aiohttp.ClientSession() as session:
  129. for mirror in mirrors:
  130. pom_url = f'{mirror}/{filepath}'
  131. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  132. async with session.get(pom_url) as response:
  133. if response.status == 200:
  134. logger.debug(f'{self}: {extension} downloaded')
  135. return await response.text()
  136. break
  137. else:
  138. logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}')
  139. else:
  140. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  141. return None
  142. @property
  143. async def pom(self) -> PackagePOM:
  144. if self._pom is not None:
  145. return self._pom
  146. if self.version is None:
  147. await self._query_maven()
  148. self._pom = PackagePOM(self, await self.download_file('pom'))
  149. return self._pom
  150. @property
  151. def _urlquery(self) -> str:
  152. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  153. if self.version is not None:
  154. q += f'+AND+v:{self.version}'
  155. return q
  156. async def _query_maven(self) -> None:
  157. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  158. logger.debug(f'{self}: Querying maven at url {url}')
  159. async with aiohttp.ClientSession() as session:
  160. async with session.get(url) as response:
  161. if response.status == 200:
  162. message = await response.json()
  163. num = message['response']['numFound']
  164. if num:
  165. logger.debug(f'{self}: Query successful')
  166. self._verified = True
  167. if self.version is None:
  168. version = message['response']['docs'][0]['latestVersion']
  169. logger.debug(f'{self}: Using newest version {version}')
  170. self.version = version
  171. else:
  172. logger.warning(f'{self}: No matching packages found')
  173. self._verified = False
  174. else:
  175. self._verified = False
  176. logger.warning(f'{self}: HTTP error {response.status} downloading pom')
  177. async def verify(self) -> bool:
  178. if not self._verified:
  179. await self._query_maven()
  180. return self._verified
  181. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  182. logger.info(f'Parsing {list_path}')
  183. with list_path.open('r') as f:
  184. for line in f.readlines():
  185. sections = line.strip().split(':')
  186. if len(sections) < 2 or len(sections) > 3:
  187. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  188. continue
  189. package = Package(
  190. sections[0],
  191. sections[1],
  192. sections[2] if len(sections) == 3 else None,
  193. )
  194. queue.put_nowait(package)
  195. async def download(package: Package, queue: asyncio.Queue) -> None:
  196. async with done_lock:
  197. skip = str(package) in done
  198. if skip:
  199. logger.info(f'{package}: Already downloaded. Skipping.')
  200. elif await package.verify():
  201. async with done_lock:
  202. done.add(str(package))
  203. pom_dir = base_pom_path / f'{package.groupId}-{package.artifactId}-{package.version}'
  204. pom_path = pom_dir / 'pom.xml'
  205. pom_dir.mkdir(exist_ok=True)
  206. pom = await package.pom
  207. if not pom:
  208. return
  209. pom.write(pom_path)
  210. logger.info(f'{package}: Downloaded')
  211. if not pom.is_bom:
  212. for dep in pom.dependency_management:
  213. logger.info(f'{package}: Handling transitive dependency {dep}')
  214. await queue.put(dep)
  215. else:
  216. logger.warning(f'{package}: Package not found. Check package name and internet connection')
  217. async def worker(queue: asyncio.Queue) -> None:
  218. while True:
  219. package = await queue.get()
  220. await download(package, queue)
  221. queue.task_done()
  222. async def main() -> None:
  223. queue: asyncio.Queue = asyncio.Queue()
  224. tasks = []
  225. load_package_list(Path('package-list.txt'), queue)
  226. logger.debug(f'Starting {num_workers} workers')
  227. for i in range(num_workers):
  228. tasks.append(
  229. asyncio.create_task(
  230. worker(queue)
  231. )
  232. )
  233. await queue.join()
  234. logger.debug('Queue is empty. Cancelling workers')
  235. for task in tasks:
  236. task.cancel()
  237. await asyncio.gather(*tasks, return_exceptions=True)
  238. logger.info('Generating master POM')
  239. subprocess.call(['sh', 'generate_master_pom.sh'])
  240. logger = logging.getLogger(__name__)
  241. if __name__ == '__main__':
  242. parser = argparse.ArgumentParser()
  243. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  244. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  245. args = parser.parse_args()
  246. if args.verbosity == 0:
  247. log_level = 'WARNING'
  248. elif args.verbosity == 1:
  249. log_level = 'INFO'
  250. else:
  251. log_level = 'DEBUG'
  252. logging.basicConfig(level=log_level)
  253. num_workers = args.workers
  254. asyncio.run(main())