Du kan inte välja fler än 25 ämnen. Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

350 lines
11KB

  1. #!/bin/python3
  2. import re
  3. import argparse
  4. import logging
  5. import asyncio
  6. import subprocess
  7. import copy
  8. import aiohttp
  9. from pathlib import Path
  10. from xml.etree import ElementTree as ET
  11. ns = {'': 'http://maven.apache.org/POM/4.0.0'}
  12. ET.register_namespace('', ns[''])
  13. baseurl = 'https://search.maven.org'
  14. base_pom_path = Path('poms')
  15. mirrors = [
  16. "https://repo.maven.apache.org/maven2",
  17. "https://repo1.maven.org/maven2",
  18. "https://oss.sonatype.org/content/repositories/snapshots",
  19. "https://packages.confluent.io/maven",
  20. "https://registry.quarkus.io/maven",
  21. "https://plugins.gradle.org/m2",
  22. ]
  23. done: set[str] = set()
  24. done_lock = asyncio.Lock()
  25. num_workers = 50
  26. class PackagePOM:
  27. def __init__(self, package: 'Package', pom: str):
  28. logger.debug(f'{package}: Parsing POM')
  29. self.raw_root = ET.fromstring(pom)
  30. if (packaging := self.raw_root.find('packaging', ns)) is not None:
  31. self.packaging = packaging.text
  32. else:
  33. self.packaging = '??'
  34. self.is_bom = self.packaging == 'pom'
  35. if self.packaging == 'pom':
  36. root_copy = copy.deepcopy(self.raw_root)
  37. depman = root_copy.find('dependencyManagement', ns)
  38. if depman is not None:
  39. root_copy.extend(depman.findall('*'))
  40. root_copy.remove(depman)
  41. if (groupId := root_copy.find('groupId', ns)) is not None:
  42. groupId.text = f'tmp.{package.groupId}'
  43. else:
  44. logger.warning(f"{package}: No groupId tag in pom")
  45. if (artifactId := root_copy.find('groupId', ns)) is not None:
  46. artifactId.text = f'placeholder.{package.artifactId}'
  47. else:
  48. logger.warning(f"{package}: No artifactId tag in pom")
  49. # Add a dependency for the pom itself
  50. if (dependencies := root_copy.find('dependencies', ns)) is not None:
  51. self_dep = ET.SubElement(dependencies, 'dependency')
  52. ET.SubElement(self_dep, 'groupId').text = package.groupId
  53. ET.SubElement(self_dep, 'artifactId').text = package.artifactId
  54. ET.SubElement(self_dep, 'version').text = package.version
  55. else:
  56. logger.warning(f"{package}: No dependencies tag in pom")
  57. self.generated_root = root_copy
  58. else:
  59. self.generated_root = ET.fromstring(
  60. f"""
  61. <project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
  62. https://maven.apache.org/xsd/maven-4.0.0.xsd"
  63. xmlns="http://maven.apache.org/POM/4.0.0"
  64. xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  65. <modelVersion>4.0.0</modelVersion>
  66. <groupId>tmp.{package.groupId}</groupId>
  67. <artifactId>placeholder-{package.artifactId}</artifactId>
  68. <version>{package.version}</version>
  69. <name>Package {package.artifactId}</name>
  70. <dependencies>
  71. <dependency>
  72. <groupId>{package.groupId}</groupId>
  73. <artifactId>{package.artifactId}</artifactId>
  74. <version>{package.version}</version>
  75. </dependency>
  76. </dependencies>
  77. </project>
  78. """
  79. )
  80. logger.debug(f'{package}: POM parsed')
  81. def write(self, f):
  82. tree = ET.ElementTree(self.generated_root)
  83. ET.indent(tree)
  84. tree.write(f)
  85. def get_property(self, prop: str):
  86. elem = self.raw_root.find(f'.//properties/{prop}', ns)
  87. if elem is not None:
  88. return elem.text
  89. else:
  90. return None
  91. def _package_from_xml_dep(self, dep: ET.Element):
  92. def prop_replace(match):
  93. prop = match.group(1)
  94. value = self.get_property(match.group(1))
  95. logger.debug(f'Replacing property {prop} with {value}')
  96. return value
  97. return Package(
  98. *[
  99. re.sub(
  100. r'\$\{([^\}]*)\}',
  101. prop_replace,
  102. elem.text or '' if (elem := dep.find(tag, ns)) is not None else '',
  103. )
  104. for tag in [
  105. 'groupId',
  106. 'artifactId',
  107. 'version',
  108. ]
  109. ]
  110. )
  111. @property
  112. def dependency_management(self) -> list['Package']:
  113. dependencies: list[Package] = []
  114. for dep in self.raw_root.find('dependencyManagement/dependencies', ns) or []:
  115. package = self._package_from_xml_dep(dep)
  116. dependencies.append(package)
  117. return dependencies
  118. class Package:
  119. _pom: PackagePOM | None = None
  120. _verified: bool = False
  121. def __init__(self, groupId: str, artifactId: str, version: str = None):
  122. self.groupId = groupId
  123. self.artifactId = artifactId
  124. self.version = version
  125. def __str__(self) -> str:
  126. return f'{self.groupId}:{self.artifactId}:{self.version or "----"}'
  127. def __eq__(self, other) -> bool:
  128. return (
  129. self.groupId == other.groupId
  130. and self.artifactId == other.artifactId
  131. and self.version == other.version
  132. )
  133. def __hash__(self) -> int:
  134. return hash((self.groupId, self.artifactId, self.version))
  135. @property
  136. def dir_path(self):
  137. group_path = self.groupId.replace(".", "/")
  138. return f'{group_path}/{self.artifactId}/{self.version}'
  139. @property
  140. def base_filename(self):
  141. return f'{self.artifactId}-{self.version}'
  142. async def download_file(self, extension):
  143. filepath = f'{self.dir_path}/{self.base_filename}.{extension}'
  144. async with aiohttp.ClientSession() as session:
  145. for mirror in mirrors:
  146. pom_url = f'{mirror}/{filepath}'
  147. logger.debug(f'{self}: Downloading {extension} from {pom_url}')
  148. async with session.get(pom_url) as response:
  149. if response.status == 200:
  150. logger.debug(f'{self}: {extension} downloaded')
  151. return await response.text()
  152. break
  153. else:
  154. logger.debug(f'{self}: HTTP error {response.status} from mirror {mirror}')
  155. else:
  156. logger.warning(f'{self}: File download of {extension} failed for all mirrors')
  157. return None
  158. @property
  159. async def pom(self) -> PackagePOM:
  160. if self._pom is not None:
  161. return self._pom
  162. if self.version is None:
  163. await self._query_maven()
  164. self._pom = PackagePOM(self, await self.download_file('pom'))
  165. return self._pom
  166. @property
  167. def _urlquery(self) -> str:
  168. q = f'g:{self.groupId}+AND+a:{self.artifactId}'
  169. if self.version is not None:
  170. q += f'+AND+v:{self.version}'
  171. return q
  172. async def _query_maven(self) -> None:
  173. url = f'{baseurl}/solrsearch/select?q={self._urlquery}&rows=1&wt=json'
  174. logger.debug(f'{self}: Querying maven at url {url}')
  175. async with aiohttp.ClientSession() as session:
  176. async with session.get(url) as response:
  177. if response.status == 200:
  178. message = await response.json()
  179. num = message['response']['numFound']
  180. if num:
  181. logger.debug(f'{self}: Query successful')
  182. self._verified = True
  183. if self.version is None:
  184. version = message['response']['docs'][0]['latestVersion']
  185. logger.debug(f'{self}: Using newest version {version}')
  186. self.version = version
  187. else:
  188. logger.warning(f'{self}: No matching packages found')
  189. self._verified = False
  190. else:
  191. self._verified = False
  192. logger.warning(f'{self}: HTTP error {response.status} downloading pom')
  193. async def verify(self) -> bool:
  194. if not self._verified:
  195. await self._query_maven()
  196. return self._verified
  197. def load_package_list(list_path: Path, queue: asyncio.Queue) -> None:
  198. logger.info(f'Parsing {list_path}')
  199. with list_path.open('r') as f:
  200. for line in f.readlines():
  201. sections = line.strip().split(':')
  202. if len(sections) < 2 or len(sections) > 3:
  203. logger.warning(f'Invalid package format "{line}". It should be "groupID:artifactID" or "groupID:artifactID:version"')
  204. continue
  205. package = Package(
  206. sections[0],
  207. sections[1],
  208. sections[2] if len(sections) == 3 else None,
  209. )
  210. queue.put_nowait(package)
  211. async def download(package: Package, queue: asyncio.Queue) -> None:
  212. async with done_lock:
  213. skip = str(package) in done
  214. if skip:
  215. logger.info(f'{package}: Already downloaded. Skipping.')
  216. elif await package.verify():
  217. async with done_lock:
  218. done.add(str(package))
  219. pom_dir = base_pom_path / f'{package.groupId}-{package.artifactId}-{package.version}'
  220. pom_path = pom_dir / 'pom.xml'
  221. pom_dir.mkdir(exist_ok=True)
  222. pom = await package.pom
  223. if not pom:
  224. return
  225. pom.write(pom_path)
  226. logger.info(f'{package}: Downloaded')
  227. if not pom.is_bom:
  228. for dep in pom.dependency_management:
  229. logger.info(f'{package}: Handling transitive dependency {dep}')
  230. await queue.put(dep)
  231. else:
  232. logger.warning(f'{package}: Package not found. Check package name and internet connection')
  233. async def worker(queue: asyncio.Queue) -> None:
  234. while True:
  235. package = await queue.get()
  236. await download(package, queue)
  237. queue.task_done()
  238. async def main() -> None:
  239. queue: asyncio.Queue = asyncio.Queue()
  240. tasks = []
  241. load_package_list(Path('package-list.txt'), queue)
  242. logger.debug(f'Starting {num_workers} workers')
  243. for i in range(num_workers):
  244. tasks.append(
  245. asyncio.create_task(
  246. worker(queue)
  247. )
  248. )
  249. await queue.join()
  250. logger.debug('Queue is empty. Cancelling workers')
  251. for task in tasks:
  252. task.cancel()
  253. await asyncio.gather(*tasks, return_exceptions=True)
  254. logger.info('Generating master POM')
  255. subprocess.call(['sh', 'generate_master_pom.sh'])
  256. logger = logging.getLogger(__name__)
  257. if __name__ == '__main__':
  258. parser = argparse.ArgumentParser()
  259. parser.add_argument('-w', '--workers', type=int, default=num_workers)
  260. parser.add_argument('-v', '--verbose', dest='verbosity', action='count', default=0)
  261. args = parser.parse_args()
  262. if args.verbosity == 0:
  263. log_level = 'WARNING'
  264. elif args.verbosity == 1:
  265. log_level = 'INFO'
  266. else:
  267. log_level = 'DEBUG'
  268. logging.basicConfig(level=log_level)
  269. num_workers = args.workers
  270. asyncio.run(main())