25'ten fazla konu seçemezsiniz Konular bir harf veya rakamla başlamalı, kısa çizgiler ('-') içerebilir ve en fazla 35 karakter uzunluğunda olabilir.

313 satır
9.6KB

  1. #!/bin/python3
  2. import re
  3. import argparse
  4. import logging
  5. import asyncio
  6. import subprocess
  7. import copy
  8. import aiohttp
  9. from pathlib import Path
  10. from xml.etree import ElementTree as ET
# Register the Maven POM namespace as the default (prefix-less) namespace so
# that serialized trees use plain tag names instead of generated prefixes.
ET.register_namespace('', 'http://maven.apache.org/POM/4.0.0')
# Base URL of the search service used by Package._query_maven.
baseurl = 'https://search.maven.org'
# Directory under which download() creates one sub-directory per package.
base_pom_path = Path('poms')
# Repositories tried in order when fetching a POM; the first HTTP 200 wins.
mirrors = [
    "https://repo.maven.apache.org/maven2",
    "https://repo1.maven.org/maven2",
    "https://oss.sonatype.org/content/repositories/snapshots",
    "https://packages.confluent.io/maven",
    "https://registry.quarkus.io/maven",
    "https://plugins.gradle.org/m2",
]
# str(package) keys of the packages download() has already taken care of.
done: set[str] = set()
# Guards access to `done` from the concurrent worker tasks.
# NOTE(review): the lock is constructed at import time, before asyncio.run()
# creates the event loop. On Python < 3.10 asyncio primitives bind to the loop
# that is current at construction -- confirm the minimum supported version.
done_lock = asyncio.Lock()
# Default number of worker tasks; the -w/--workers option overrides it.
num_workers = 50
  25. class PackagePOM:
  26. _dependencyManagement: list['Package'] = None
  27. def __init__(self, package: 'Package', pom: str):
  28. logger.debug(f'{package}: Parsing POM')
  29. self.raw_root = ET.fromstring(pom)
  30. packaging = self.raw_root.find('packaging')
  31. self.is_bom = True if packaging is not None and packaging.text == 'pom' else False
  32. if self.is_bom:
  33. root_copy = copy.deepcopy(self.raw_root)
  34. depman = root_copy.find('dependencyManagement')
  35. root_copy.extend(depman.findall('*'))
  36. root_copy.remove(depman)
  37. self.generated_root = root_copy
  38. else:
  39. self.generated_root = ET.fromstring(
  40. f"""
  41. <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
  42. https://maven.apache.org/xsd/maven-4.0.0.xsd"
  43. xmlns="http://maven.apache.org/POM/4.0.0"
  44. xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  45. <modelVersion>4.0.0</modelVersion>
  46. <groupId>tmp.{package.groupId}</groupId>
  47. <artifactId>placeholder-{package.artifactId}</artifactId>
  48. <version>{package.version}</version>
  49. <name>Package {package.artifactId}</name>
  50. <dependencies>
  51. <dependency>
  52. <groupId>{package.groupId}</groupId>
  53. <artifactId>{package.artifactId}</artifactId>
  54. <version>{package.version}</version>
  55. </dependency>
  56. </dependencies>
  57. </project>
  58. """
  59. )
  60. def write(self, f):
  61. tree = ET.ElementTree(self.generated_root)
  62. ET.indent(tree)
  63. tree.write(f)
  64. def get_property(self, prop: str):
  65. elem = self.raw_root.find(f'.//properties/{prop}')
  66. if elem is not None:
  67. return elem.text
  68. else:
  69. return None
  70. @property
  71. def dependencyManagement(self) -> list['Package']:
  72. if self._dependencyManagement is not None:
  73. return self._dependencyManagement
  74. self._dependencyManagement = []
  75. def prop_replace(match):
  76. prop = match.group(1)
  77. value = self.get_property(match.group(1))
  78. logger.debug(f'Replacing property {prop} with {value}')
  79. return value
  80. for dep in self.raw_root.find('dependencyManagement/dependencies') or []:
  81. package = Package(
  82. *[
  83. re.sub(
  84. r'\$\{([^\}]*)\}',
  85. prop_replace,
  86. dep.find(tag).text,
  87. )
  88. for tag in [
  89. 'groupId',
  90. 'artifactId',
  91. 'version',
  92. ]
  93. ]
  94. )
  95. self._dependencyManagement.append(package)
  96. return self._dependencyManagement
  97. class Package:
  98. _pom: PackagePOM = None
  99. _verified: bool = False
  100. def __init__(self, groupId: str, artifactId: str, version: str = None):
  101. self.groupId = groupId
  102. self.artifactId = artifactId
  103. self.version = version
  104. def __str__(self) -> str:
  105. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  106. def __eq__(self, other) -> bool:
  107. return (
  108. self.groupId == other.groupId
  109. and self.artifactId == other.artifactId
  110. and self.version == other.version
  111. )
  112. def __hash__(self) -> str:
  113. return hash((self.groupId, self.artifactId, self.version))
  114. @property
  115. async def pom(self) -> ET:
  116. if self._pom is not None:
  117. return self._pom
  118. if self.version is None:
  119. self._query_maven()
  120. group_path = self.groupId.replace(".", "/")
  121. pom_path = f'{self.artifactId}-{self.version}.pom'
  122. filepath = f'{group_path}/{self.artifactId}/{self.version}/{pom_path}'
  123. async with aiohttp.ClientSession() as session:
  124. for mirror in mirrors:
  125. pom_url = f'{mirror}/{filepath}'
  126. logger.debug(f'{self}: Downloading pom from {pom_url}')
  127. async with session.get(pom_url) as response:
  128. if response.status == 200:
  129. logger.debug(f'{self}: POM downloaded')
  130. self._pom = PackagePOM(self, await response.text())
  131. break
  132. else:
  133. logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}')
  134. else:
  135. logger.warning(f'{self}: Failed for all mirrors')
  136. return self._pom
  137. @property
  138. def _urlquery(self) -> str:
  139. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  140. if self.version is not None:
  141. q += f'+AND+v:{self.version}'
  142. return q
  143. async def _query_maven(self) -> None:
  144. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  145. logger.debug(f'{self}: Querying maven at url {url}')
  146. async with aiohttp.ClientSession() as session:
  147. async with session.get(url) as response:
  148. if response.status == 200:
  149. message = await response.json()
  150. num = message['response']['numFound']
  151. if num:
  152. logger.debug(f'{self}: Query successful')
  153. self._verified = True
  154. if self.version is None:
  155. version = message['response']['docs'][0]['latestVersion']
  156. logger.debug(f'{self}: Using newest version {version}')
  157. self.version = version
  158. else:
  159. logger.warning(f'{self}: No matching packages found')
  160. self._verified = False
  161. else:
  162. self._verified = False
  163. logger.warning(f'{self}: HTTP error {response.status} downloading pom')
  164. async def verify(self) -> bool:
  165. if not self._verified:
  166. await self._query_maven()
  167. return self._verified
  168. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  169. logger.info(f'Parsing {list_path}')
  170. with list_path.open('r') as f:
  171. for line in f.readlines():
  172. sections = line.strip().split(':')
  173. if len(sections) < 2 or len(sections) > 3:
  174. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  175. continue
  176. package = Package(
  177. sections[0],
  178. sections[1],
  179. sections[2] if len(sections) == 3 else None,
  180. )
  181. queue.put_nowait(package)
  182. async def download(package: Package, queue: asyncio.Queue) -> None:
  183. async with done_lock:
  184. skip = str(package) in done
  185. if skip:
  186. logger.info(f'{package}: Already downloaded. Skipping.')
  187. elif await package.verify():
  188. async with done_lock:
  189. done.add(str(package))
  190. pom_dir = base_pom_path / str(package)
  191. pom_path = pom_dir / 'pom.xml'
  192. pom_dir.mkdir(exist_ok=True)
  193. pom = await package.pom
  194. if not pom:
  195. return
  196. pom.write(pom_path)
  197. logger.info(f'{package}: Downloaded')
  198. if not pom.is_bom:
  199. for dep in pom.dependencyManagement:
  200. logger.info(f'{package}: Handling transitive dependency {dep}')
  201. await queue.put(dep)
  202. else:
  203. logger.warning(f'{package}: Package not found. Check package name and internet connection')
  204. async def worker(queue: asyncio.Queue) -> None:
  205. while True:
  206. package = await queue.get()
  207. await download(package, queue)
  208. queue.task_done()
  209. async def main() -> None:
  210. queue = asyncio.Queue()
  211. tasks = []
  212. load_package_list(Path('package-list.txt'), queue)
  213. logger.debug(f'Starting {num_workers} workers')
  214. for i in range(num_workers):
  215. tasks.append(
  216. asyncio.create_task(
  217. worker(queue)
  218. )
  219. )
  220. await queue.join()
  221. logger.debug('Queue is empty. Cancelling workers')
  222. for task in tasks:
  223. task.cancel()
  224. await asyncio.gather(*tasks, return_exceptions=True)
  225. logger.info('Generating master POM')
  226. subprocess.call(['sh', 'generate_master_pom.sh'])
  227. logger = logging.getLogger(__name__)
  228. if __name__ == '__main__':
  229. parser = argparse.ArgumentParser()
  230. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  231. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  232. args = parser.parse_args()
  233. if args.verbosity == 0:
  234. log_level = 'WARNING'
  235. elif args.verbosity == 1:
  236. log_level = 'INFO'
  237. else:
  238. log_level = 'DEBUG'
  239. logging.basicConfig(level=log_level)
  240. num_workers = args.workers
  241. asyncio.run(main())