<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing with OASIS Tables v3.0 20080202//EN" "journalpub-oasis3.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:oasis="http://docs.oasis-open.org/ns/oasis-exchange/table" xml:lang="en" dtd-version="3.0">
  <front>
    <journal-meta><journal-id journal-id-type="publisher">GMD</journal-id><journal-title-group>
    <journal-title>Geoscientific Model Development</journal-title>
    <abbrev-journal-title abbrev-type="publisher">GMD</abbrev-journal-title><abbrev-journal-title abbrev-type="nlm-ta">Geosci. Model Dev.</abbrev-journal-title>
  </journal-title-group><issn pub-type="epub">1991-9603</issn><publisher>
    <publisher-name>Copernicus Publications</publisher-name>
    <publisher-loc>Göttingen, Germany</publisher-loc>
  </publisher></journal-meta>
    <article-meta>
      <article-id pub-id-type="doi">10.5194/gmd-11-1665-2018</article-id><title-group><article-title>Near-global climate simulation at 1 km resolution: establishing a performance baseline on 4888 GPUs with COSMO 5.0</article-title><alt-title>Near-global climate simulation at 1 km resolution</alt-title>
      </title-group><?xmltex \runningtitle{Near-global climate simulation at 1\,km resolution}?><?xmltex \runningauthor{O. Fuhrer et al.}?>
      <contrib-group>
        <contrib contrib-type="author" corresp="yes" rid="aff1">
          <name><surname>Fuhrer</surname><given-names>Oliver</given-names></name>
          <email>oliver.fuhrer@meteoswiss.ch</email>
        <ext-link>https://orcid.org/0000-0002-0682-1374</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff2">
          <name><surname>Chadha</surname><given-names>Tarun</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff3">
          <name><surname>Hoefler</surname><given-names>Torsten</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff3">
          <name><surname>Kwasniewski</surname><given-names>Grzegorz</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff1">
          <name><surname>Lapillonne</surname><given-names>Xavier</given-names></name>
          
        <ext-link>https://orcid.org/0000-0001-6114-4321</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff4">
          <name><surname>Leutwyler</surname><given-names>David</given-names></name>
          
        <ext-link>https://orcid.org/0000-0002-5141-1737</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff4">
          <name><surname>Lüthi</surname><given-names>Daniel</given-names></name>
          
        <ext-link>https://orcid.org/0000-0001-6638-691X</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff1">
          <name><surname>Osuna</surname><given-names>Carlos</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff4">
          <name><surname>Schär</surname><given-names>Christoph</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff5 aff6">
          <name><surname>Schulthess</surname><given-names>Thomas C.</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff6">
          <name><surname>Vogt</surname><given-names>Hannes</given-names></name>
          
        </contrib>
        <aff id="aff1"><label>1</label><institution>Federal Institute of Meteorology and Climatology, MeteoSwiss, Zurich, Switzerland</institution>
        </aff>
        <aff id="aff2"><label>2</label><institution>ITS Research Informatics, ETH Zurich, Switzerland</institution>
        </aff>
        <aff id="aff3"><label>3</label><institution>Scalable Parallel Computing Lab, ETH Zurich, Switzerland</institution>
        </aff>
        <aff id="aff4"><label>4</label><institution>Institute for Atmospheric and Climate Science, ETH Zurich, Switzerland</institution>
        </aff>
        <aff id="aff5"><label>5</label><institution>Institute for Theoretical Physics, ETH Zurich, Switzerland</institution>
        </aff>
        <aff id="aff6"><label>6</label><institution>Swiss National Supercomputing Centre, CSCS, Lugano, Switzerland</institution>
        </aff>
      </contrib-group>
      <author-notes><corresp id="corr1">Oliver Fuhrer (oliver.fuhrer@meteoswiss.ch)</corresp></author-notes><pub-date><day>2</day><month>May</month><year>2018</year></pub-date>
      
      <volume>11</volume>
      <issue>4</issue>
      <fpage>1665</fpage><lpage>1681</lpage>
      <history>
        <date date-type="received"><day>16</day><month>September</month><year>2017</year></date>
           <date date-type="rev-request"><day>5</day><month>October</month><year>2017</year></date>
           <date date-type="rev-recd"><day>7</day><month>February</month><year>2018</year></date>
           <date date-type="accepted"><day>8</day><month>February</month><year>2018</year></date>
      </history>
      <permissions>
        
        
      <license license-type="open-access"><license-p>This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this licence, visit <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link></license-p></license></permissions><self-uri xlink:href="https://gmd.copernicus.org/articles/11/1665/2018/gmd-11-1665-2018.html">This article is available from https://gmd.copernicus.org/articles/11/1665/2018/gmd-11-1665-2018.html</self-uri><self-uri xlink:href="https://gmd.copernicus.org/articles/11/1665/2018/gmd-11-1665-2018.pdf">The full text article is available as a PDF file from https://gmd.copernicus.org/articles/11/1665/2018/gmd-11-1665-2018.pdf</self-uri>
      <abstract>
    <p id="d1e206">The best hope for reducing long-standing global climate model biases is by
increasing resolution to the kilometer scale. Here we present results from an
ultrahigh-resolution non-hydrostatic climate model for a near-global setup
running on the full Piz Daint supercomputer on 4888 GPUs (graphics
processing units). The dynamical core of the model has been completely
rewritten using a domain-specific language (DSL) for performance portability
across different hardware architectures. Physical parameterizations and
diagnostics have been ported using compiler directives. To our knowledge this
represents the first complete atmospheric model being run entirely on
accelerators on this scale. At a grid spacing of 930 m (1.9 km), we achieve
a simulation throughput of 0.043 (0.23) simulated years per day and an energy
consumption of 596 MWh per simulated year. Furthermore, we propose a new
memory usage efficiency (MUE) metric that considers how efficiently the
memory bandwidth – the dominant bottleneck of climate codes – is being
used.</p>
  </abstract>
    </article-meta>
  </front>
<body>
      

<sec id="Ch1.S1" sec-type="intro">
  <title>Introduction</title>
      <p id="d1e216">Should global warming occur at the upper end of the range of current projections, the local impacts
of unmitigated climate change would be dramatic.
Particular
concerns relate to the projected sea-level rise, increases in the incidence of extreme
events such as heat waves and floods, and changes in the availability of water resources
and the occurrence of droughts <xref ref-type="bibr" rid="bib1.bibx49" id="paren.1"/>.</p>
      <p id="d1e222">Current climate projections are mostly based on global climate models (GCMs).
These models represent the coupled atmosphere–ocean–land system and integrate
the governing equations, for instance, for a set of prescribed emissions scenarios. Despite
significant progress during the last decades, uncertainties are still large.
For example, current estimates of the equilibrium global mean surface warming for doubled greenhouse gas
concentrations range between 1.5 and 4.5 <inline-formula><mml:math id="M1" display="inline"><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup></mml:math></inline-formula>C
<xref ref-type="bibr" rid="bib1.bibx49" id="paren.2"/>. On regional scales and in terms of the hydrological cycle, the
uncertainties are even larger.
Reducing the uncertainties of climate change projections, in order to make optimal
mitigation and adaptation decisions, is thus urgent and has a tremendous economic
value <xref ref-type="bibr" rid="bib1.bibx31" id="paren.3"/>.</p>
      <?pagebreak page1666?><p id="d1e240">How can the uncertainties of climate projections be reduced?
There is overwhelming evidence
from the literature that the leading cause of uncertainty is the representation of
clouds, largely due to their influence upon the reflection of incoming solar radiation
<xref ref-type="bibr" rid="bib1.bibx9 bib1.bibx6 bib1.bibx58" id="paren.4"/>.
Horizontal
resolutions of current global climate models are typically in the range
of 50–200 km. At this resolution, clouds must be parameterized, based
on theoretical and semiempirical considerations. Refining the
resolution to the kilometer scale would allow the explicit representation of deep convective
clouds (thunderstorms and rain showers; e.g., Fig. <xref ref-type="fig" rid="Ch1.F1"/>). Studies using regional climate models
demonstrate that at this resolution, the representation of precipitation is
dramatically improved <xref ref-type="bibr" rid="bib1.bibx35 bib1.bibx4" id="paren.5"/>. The representation of
shallow cumulus cloud layers, which are common over significant fractions of the
tropical oceans, requires even higher resolution. The United States National
Academy of Sciences has thus recommended <xref ref-type="bibr" rid="bib1.bibx47" id="paren.6"/> developing “high-end
global models that execute efficiently ... , enabling cloud-resolving atmospheric
resolutions (2–4 km) and eddy-resolving ocean resolutions (5 km)” in the near future.</p>

      <?xmltex \floatpos{t}?><fig id="Ch1.F1" specific-use="star"><caption><p id="d1e256">Visualization of a baroclinic wave at day 10 of a simulation with 930 m grid spacing. White
shading: volume rendering of cloud ice, cloud water, and graupel <inline-formula><mml:math id="M2" display="inline"><mml:mrow><mml:mo>≥</mml:mo><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">3</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> g kg<inline-formula><mml:math id="M3" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>. Blue shading: isosurface of rain and snow
hydrometeors <inline-formula><mml:math id="M4" display="inline"><mml:mrow><mml:mo>≥</mml:mo><mml:mn mathvariant="normal">4</mml:mn><mml:mo>×</mml:mo><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> g kg<inline-formula><mml:math id="M5" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>. The white contours denote surface pressure.</p></caption>
        <?xmltex \igopts{width=426.791339pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/1665/2018/gmd-11-1665-2018-f01.jpg"/>

      </fig>

      <p id="d1e326">While the scientific prospects of such an undertaking are highly
promising, the computational implications are significant.
Increasing the horizontal resolution from 50 to 2 km increases the
computational effort by at least a factor of <inline-formula><mml:math id="M6" display="inline"><mml:mrow><mml:msup><mml:mn mathvariant="normal">25</mml:mn><mml:mn mathvariant="normal">3</mml:mn></mml:msup><mml:mo>≈</mml:mo><mml:mn mathvariant="normal">15</mml:mn></mml:mrow></mml:math></inline-formula> 000. Such simulations will only
be possible on future extreme-scale high-performance computers. Furthermore, power
constraints have been driving the widespread adoption of many-core accelerators in
leading edge supercomputers and the weather and climate community is struggling to
migrate the large existing codes to these architectures and use them efficiently.</p>
      <p id="d1e344">But what does efficient mean? While concerns of the total cost of ownership
of a high-performance computing (HPC) system have shifted the focus from peak floating
point performance towards improving power efficiency, it
is not clear what the right efficiency metric is for a fully fledged
climate model.
Today, floating point operations are around 100<inline-formula><mml:math id="M7" display="inline"><mml:mo>×</mml:mo></mml:math></inline-formula> cheaper than data
movement in terms of time and 1000<inline-formula><mml:math id="M8" display="inline"><mml:mo>×</mml:mo></mml:math></inline-formula> cheaper in terms of energy, depending on where
the data come from <xref ref-type="bibr" rid="bib1.bibx7 bib1.bibx59" id="paren.7"/>.
Thus, while
focusing on floating point operations was very relevant 25 years ago, it has
lost most of this relevance today. Instead, domain-specific metrics
may be much more applicable to evaluate and compare application
performance. A metric often used for climate models is the
throughput achieved by the simulation measured in simulated years
per wall clock day (SYPD; see <xref ref-type="bibr" rid="bib1.bibx2" id="text.8"/> for a detailed discussion on metrics).
For global atmospheric models, a suitable near-term target is to conduct decade-long
simulations and to participate in the Atmospheric Model Intercomparison Project (AMIP) effort.
<fn id="Ch1.Footn1"><p id="d1e367">This is part of the Coupled Model Intercomparison Project (CMIP6; see <xref ref-type="bibr" rid="bib1.bibx20" id="altparen.9"/>).</p></fn>
Such simulations require a 36-year long simulation for the period 1979–2014, driven
by observed atmospheric greenhouse gas concentrations and sea-surface temperatures.
Within the context of current climate modeling centers, such a simulation would be
feasible for an SYPD greater than or equal to 0.2–0.3.
At such a rate the simulation would take up
to several months.
However, domain-specific metrics such as SYPD are
very dependent on the specific problem and approximations in the code
under consideration and are often hard to compare.
Ideally, comparisons would be performed for production-quality global atmospheric models that have
been extensively validated for climate simulations and cover the full (non-hydrostatic and compressible)
dynamics and the entire suite of model parameterizations.</p>
      <p id="d1e374">With the SYPD metric alone, it is hard to assess how efficiently a
particular computing platform is used. Efficiency of use is particularly
important because, on the typical scale of climate simulations,
computing resources are very costly and energy intensive. Thus, running
high-resolution climate simulations also faces a significant computer
science problem when it comes to computational efficiency.  As mentioned
before, floating point efficiency is often not relevant for
state-of-the-art climate codes. Not only does counting floating point operations per second (flop s<inline-formula><mml:math id="M9" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>) not
reflect the actual (energy) costs well, but the typical climate code
has very low arithmetic intensity (the ratio of floating point operations to consumed memory
bandwidth). Attempts to increase the
arithmetic intensity may increase the floating point rate, but it is not
clear if it improves any of the significant metrics (e.g., SYPD).
However, solely focusing on memory bandwidth can also be misleading.
Thus, we propose memory usage efficiency (MUE), a new metric that
considers the efficiency of the code's implementation with respect to
input/output (I/O) complexity bounds as well as the achieved system
memory bandwidth.</p>
      <p id="d1e389">In summary, the next grand challenge of climate modeling is refining the grid spacing of
the production model codes to the kilometer scale, as it will allow addressing
long-standing
open questions and uncertainties on the impact of anthropogenic effects on the future of our planet.
Here, we address this great challenge and demonstrate the first simulation of a
production-level atmospheric model,
delivering 0.23 (0.043) SYPD at a grid spacing of 1.9 km (930 m), sufficient
for AMIP-type simulations. Further, we evaluate the efficiency
of these simulations using a new memory usage efficiency metric.</p>
</sec>
<sec id="Ch1.S2">
  <title>Current state of the art</title>
      <p id="d1e398">Performing global kilometer-scale climate simulations is an ambitious goal <xref ref-type="bibr" rid="bib1.bibx50" id="paren.10"/>,
but a few kilometer-scale landmark simulations have already been performed. While arguably not the
most relevant metric, many of the studies have reported sustained floating point performance.
In 2007, <xref ref-type="bibr" rid="bib1.bibx45" id="text.11"/>
performed a week-long simulation with a horizontal grid spacing of 3.5 km with the Nonhydrostatic Icosahedral
Atmospheric Model (NICAM) on the Earth Simulator, and in 2013 <xref ref-type="bibr" rid="bib1.bibx46" id="text.12"/> performed a 12 h long
simulation at a grid spacing of 870 m on the K computer, achieving 230 Tflop s<inline-formula><mml:math id="M10" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> double-precision performance.<fn id="Ch1.Footn2"><p id="d1e422">1 Tflop s<inline-formula><mml:math id="M11" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> = <inline-formula><mml:math id="M12" display="inline"><mml:mrow><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mn mathvariant="normal">12</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula> flop s<inline-formula><mml:math id="M13" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>.</p></fn>
In 2014, <xref ref-type="bibr" rid="bib1.bibx60" id="text.13"/> performed a 20-day long simulation with a horizontal grid spacing of 3 km
with the Model for Prediction Across Scales (MPAS) and later, in 2015, participated in the Next Generation Global
Prediction System (NGGPS) model intercomparison project <xref ref-type="bibr" rid="bib1.bibx44" id="paren.14"/> at the same resolution and
achieved 0.16 SYPD on the full National Energy Research Scientific Computing Center (NERSC) Edison system.
In 2015, <xref ref-type="bibr" rid="bib1.bibx10" id="text.15"/> simulated several months of an
extended aquaplanet channel at a grid spacing of 4 km using the System for Atmospheric Modeling (SAM).
<xref ref-type="bibr" rid="bib1.bibx68" id="text.16"/> were the first to deploy a weather code on the
TSUBAME system accelerated using graphics processing units (GPUs). The fully rewritten NICAM model
sustained a double-precision performance of 60 Tflop s<inline-formula><mml:math id="M14" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> on 2560 GPUs of
the TSUBAME 2.5 supercomputer.
In 2016, <xref ref-type="bibr" rid="bib1.bibx67" id="text.17"/> implemented a fully implicit dynamical core
at 488 m grid spacing in a <inline-formula><mml:math id="M15" display="inline"><mml:mi mathvariant="italic">β</mml:mi></mml:math></inline-formula>-plane channel
achieving 7.95 Pflop s<inline-formula><mml:math id="M16" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> on the TaihuLight supercomputer.<fn id="Ch1.Footn3"><p id="d1e509">1 Pflop s<inline-formula><mml:math id="M17" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> = <inline-formula><mml:math id="M18" display="inline"><mml:mrow><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mn mathvariant="normal">15</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula> flop s<inline-formula><mml:math id="M19" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>.</p></fn></p>
      <p id="d1e547">The optimal numerical approach for high-resolution climate models may depend on the details of the target
hardware architecture.
For a more thorough analysis, the physical propagation of information in the atmosphere has to be considered.
While many limited-area
atmospheric models use a filtered set of the governing equations that
suppresses sound propagation, these approaches are not precise enough for global applications <xref ref-type="bibr" rid="bib1.bibx13" id="paren.18"/>.
Thus, the largest physical group velocity to face in global atmospheric models is the speed of sound.
The speed of sound in the atmosphere amounts to between 280 and 360 m s<inline-formula><mml:math id="M20" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>.
Thus in a time span of an hour, the minimum distance across which information needs to be exchanged
amounts to about 1500 km, corresponding to a tiny fraction of 1.4 % of the Earth's surface.
However, many numerical schemes
exchange information at much larger rates. For instance, the popular pseudo-spectral methodology
(e.g., <xref ref-type="bibr" rid="bib1.bibx18" id="altparen.19"/>) requires Legendre and
Fourier transforms between the physical grid and the spherical harmonics and thus couples them globally at each time
step. Similarly, semi-Lagrangian semi-implicit time-integration methods require the solution of a Helmholtz-type
elliptic equation <xref ref-type="bibr" rid="bib1.bibx14" id="paren.20"/>, which implies global communication at each time step. Both methods use
long time steps, which may partially mitigate the additional communication overhead.
While these methods have enabled fast and accurate solutions at intermediate resolution in the past, they are likely not suited for
ultrahigh-resolution models, as the rate of communication typically increases proportionally to the horizontal
mesh size. Other approaches use time-integration methods with only locally implicit solvers (e.g., <xref ref-type="bibr" rid="bib1.bibx25" id="altparen.21"/>),
where they try to retain the advantages of fully implicit methods but only require
nearest-neighbor communication.</p>
      <p id="d1e574">The main advantage of implicit and semi-implicit approaches is that they allow large acoustic Courant
numbers <inline-formula><mml:math id="M21" display="inline"><mml:mrow><mml:msub><mml:mi mathvariant="italic">α</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>c</mml:mi><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>t</mml:mi><mml:mo>/</mml:mo><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:math></inline-formula>, where <inline-formula><mml:math id="M22" display="inline"><mml:mi>c</mml:mi></mml:math></inline-formula> denotes the speed of sound and <inline-formula><mml:math id="M23" display="inline"><mml:mrow><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:math></inline-formula>
and <inline-formula><mml:math id="M24" display="inline"><mml:mrow><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:math></inline-formula> the time step and the grid spacing, respectively. For instance, <xref ref-type="bibr" rid="bib1.bibx67" id="text.22"/> use an acoustic
Courant number up to 177; i.e., their time step is 177 times larger than in a standard explicit integration
(this estimate is based on the <inline-formula><mml:math id="M25" display="inline"><mml:mrow><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>x</mml:mi><mml:mo>=</mml:mo><mml:mn mathvariant="normal">488</mml:mn></mml:mrow></mml:math></inline-formula> m simulation with <inline-formula><mml:math id="M26" display="inline"><mml:mrow><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn mathvariant="normal">240</mml:mn></mml:mrow></mml:math></inline-formula> s).
In their case, such a large time step may be chosen, as the sound propagation is not relevant for weather phenomena.</p>
      <p id="d1e661">However, although implicit methods are unconditionally stable (stable irrespective of the time step used),
there are other limits to choosing the time step.
In order to appropriately represent advective processes with typical velocities up to
100 m s<inline-formula><mml:math id="M27" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> and associated phase changes (e.g.,<?pagebreak page1668?> condensation and
fallout of precipitation), numerical principles dictate an upper limit to the advective Courant number
<inline-formula><mml:math id="M28" display="inline"><mml:mrow><mml:msub><mml:mi mathvariant="italic">α</mml:mi><mml:mi>u</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mfenced close="|" open="|"><mml:mi>u</mml:mi></mml:mfenced><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>t</mml:mi><mml:mo>/</mml:mo><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:math></inline-formula>, where <inline-formula><mml:math id="M29" display="inline"><mml:mrow><mml:mfenced close="|" open="|"><mml:mi>u</mml:mi></mml:mfenced></mml:mrow></mml:math></inline-formula> denotes the largest
advective wind speed, e.g., <xref ref-type="bibr" rid="bib1.bibx19" id="text.23"/>. The specific limit for <inline-formula><mml:math id="M30" display="inline"><mml:mrow><mml:msub><mml:mi mathvariant="italic">α</mml:mi><mml:mi>u</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> depends on the numerical
implementation and time-stepping procedures. For instance, semi-Lagrangian schemes may produce accurate
results
for values of <inline-formula><mml:math id="M31" display="inline"><mml:mrow><mml:msub><mml:mi mathvariant="italic">α</mml:mi><mml:mi>u</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> up to 4 or even larger. For most standard implementations, however, there are much more
stringent limits, often requiring that <inline-formula><mml:math id="M32" display="inline"><mml:mrow><mml:msub><mml:mi mathvariant="italic">α</mml:mi><mml:mi>u</mml:mi></mml:msub><mml:mo>≤</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:math></inline-formula>. For the recent study of <xref ref-type="bibr" rid="bib1.bibx67" id="text.24"/>, who used a
fully implicit scheme with a time step of 240 s, the advective Courant number reaches values of up to
<inline-formula><mml:math id="M33" display="inline"><mml:mrow><mml:msub><mml:mi mathvariant="italic">α</mml:mi><mml:mi>u</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mn mathvariant="normal">4.2</mml:mn></mml:mrow></mml:math></inline-formula> and 17.2 for the <inline-formula><mml:math id="M34" display="inline"><mml:mrow><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>x</mml:mi><mml:mo>=</mml:mo><mml:mn mathvariant="normal">2</mml:mn></mml:mrow></mml:math></inline-formula> km and 488 m simulation, respectively.
Depending upon the numerical approximation,
such a large Courant
number will imply significant phase errors <xref ref-type="bibr" rid="bib1.bibx17" id="paren.25"/> or even a reduction in
effective model resolution <xref ref-type="bibr" rid="bib1.bibx55" id="paren.26"/>. In order to produce accurate results, the scheme
would
require a significantly smaller time step and would require reducing
the time step with decreasing grid spacing.
For the NGGPS intercomparison the hydrostatic Integrated Forecasting System (IFS) model used a time step of 120 s at
3.125 km <xref ref-type="bibr" rid="bib1.bibx44" id="paren.27"/>, the regional semi-implicit,
semi-Lagrangian, fully non-hydrostatic model MC2 used a time step of 30 s at 3.0 km <xref ref-type="bibr" rid="bib1.bibx5" id="paren.28"/>, and Météo France
in their semi-implicit Application of Research to Operations at Mesoscale (AROME) model use a time step of 45 and 60 s for their 1.3 and 2.5 km
implementations, respectively. Since the IFS model is not a non-hydrostatic model,
we conclude that even for fully implicit, global,
convection-resolving climate simulations at <inline-formula><mml:math id="M35" display="inline"><mml:mo>∼</mml:mo></mml:math></inline-formula> 1–2 km grid spacing, a time step larger than
40–60 s cannot be considered a viable option.</p>
      <p id="d1e805">In the current study we use the split-explicit time-stepping scheme with an underlying Runge–Kutta time step
<xref ref-type="bibr" rid="bib1.bibx65" id="paren.29"/> of the Consortium for Small-Scale Modeling (COSMO) model (see Sect. <xref ref-type="sec" rid="Ch1.S3.SS1"/>).
This scheme uses sub-time stepping for the fast (acoustic) modes with
a small
time step <inline-formula><mml:math id="M36" display="inline"><mml:mrow><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi mathvariant="italic">τ</mml:mi></mml:mrow></mml:math></inline-formula> and explicit time stepping for all other modes with a large time step
<inline-formula><mml:math id="M37" display="inline"><mml:mrow><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mi>n</mml:mi><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi mathvariant="italic">τ</mml:mi></mml:mrow></mml:math></inline-formula>. Most of the computations are required on the large time step, with
<inline-formula><mml:math id="M38" display="inline"><mml:mrow><mml:msub><mml:mi mathvariant="italic">α</mml:mi><mml:mi>u</mml:mi></mml:msub><mml:mo>≤</mml:mo><mml:mn mathvariant="normal">2</mml:mn></mml:mrow></mml:math></inline-formula>, depending on the combination of time-integration and advection scheme.
In contrast to semi-implicit, semi-Lagrangian, and implicit schemes,
the approach does not require solving a global equation and all computations are local (i.e., vertical columns
exchange information merely with their neighbors). The main advantage of this approach is that it exhibits
– at least in theory –  perfect weak scaling.<fn id="Ch1.Footn4"><p id="d1e857">Weak scaling is defined as how the solution time varies
with the number of processing elements for a fixed problem size per processing element. This is in contrast to
strong scaling, where the total problem size is kept fixed.</p></fn> This also applies to the communication load per sub-domain,
when applying horizontal domain decomposition.</p>
</sec>
<sec id="Ch1.S3">
  <title>Methods</title>
<sec id="Ch1.S3.SS1">
  <title>Model description</title>

      <?xmltex \floatpos{t}?><fig id="Ch1.F2" specific-use="star"><caption><p id="d1e874">Illustration of the computational complexity of the COSMO dynamical core, using a computational directed acyclic graph (CDAG). The nodes of the graph represent computational kernels (blue ellipses)
that can have multiple input and output variables,
halo updates (green rectangles), and boundary condition operations (orange rectangles). The edges of the graph
represent data dependencies. Since the dynamical core of COSMO has been written using a DSL, the CDAG can be produced automatically using an analysis back end of the DSL compiler. The lengthy serial
section in the middle of the figure corresponds to the sound waves sub-stepping in the fast-wave solver. The parallel
section in the upper left corresponds to the advection of the seven tracer variables.
CDAGs can automatically be produced from C++ code.</p></caption>
          <?xmltex \igopts{width=341.433071pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/1665/2018/gmd-11-1665-2018-f02.pdf"/>

        </fig>

      <p id="d1e883">For the simulations presented in this paper, we use a refactored version 5.0 of the regional weather and climate code developed
by COSMO <xref ref-type="bibr" rid="bib1.bibx12 bib1.bibx16 bib1.bibx61" id="paren.30"/>
and – for the climate mode – the Climate Limited-area Modelling (CLM) Community <xref ref-type="bibr" rid="bib1.bibx11" id="paren.31"/>.
At kilometer-scale resolution, COSMO is used for numerical weather prediction
<xref ref-type="bibr" rid="bib1.bibx56 bib1.bibx3" id="paren.32"/> and has been thoroughly evaluated for climate simulations
in Europe <xref ref-type="bibr" rid="bib1.bibx4 bib1.bibx41" id="paren.33"/>.
The COSMO
model is based on the thermo-hydrodynamical equations describing non-hydrostatic, fully
compressible flow in a moist atmosphere.
It solves the fully compressible Euler equations using finite difference discretization in space <xref ref-type="bibr" rid="bib1.bibx16 bib1.bibx61" id="paren.34"/>.
For time stepping, it uses a split-explicit three-stage second-order Runge–Kutta discretization to integrate the prognostic
variables forward in time <xref ref-type="bibr" rid="bib1.bibx65" id="paren.35"/>. For horizontal advection, a fifth-order upwind scheme is used
for the dynamic variables and a Bott scheme <xref ref-type="bibr" rid="bib1.bibx8" id="paren.36"/> is used for the moisture variables. The model includes
a full set of physical parametrizations required for real-case simulations. For this study, we use a single-moment
bulk cloud microphysics scheme that uses five species (cloud water, cloud ice, rain, snow, and graupel) described
in <xref ref-type="bibr" rid="bib1.bibx54" id="text.37"/>. For the full physics simulations, additionally a radiation scheme <xref ref-type="bibr" rid="bib1.bibx57" id="paren.38"/>, a soil
model <xref ref-type="bibr" rid="bib1.bibx29" id="paren.39"/>, and a sub-grid-scale turbulence scheme <xref ref-type="bibr" rid="bib1.bibx53" id="paren.40"/> are switched on.</p>
      <p id="d1e920">The COSMO model is a regional model and physical space is discretized in a rotated latitude–longitude–height
coordinate system and projected onto a regular, structured, three-dimensional grid (IJK).
In the vertical, a
terrain-following coordinate supports an arbitrary topography.
The spatial discretization applied to solve the governing equations generates so-called stencil
computations (operations that require data from neighboring grid points).
Due to the strong anisotropy of the atmosphere, implicit methods are employed in the vertical
direction,
as opposed to the explicit methods applied to the horizontal operators.
The numerical discretization yields a large number of mixed compact horizontal stencils and vertical implicit
solvers,
strongly connected via the data dependencies on the prognostic variables.
Figure <xref ref-type="fig" rid="Ch1.F2"/> shows the data dependency graph of the computational kernels of the
dynamical core of COSMO used in this setup,
where each computational kernel corresponds to a complex set of fused stencil operations in order to maximize the
data locality of the algorithm. Each computational kernel typically has multiple input and output fields and thus
data dependencies as indicated with the edges of the Computational Directed Acyclic Graph (CDAG) shown in Fig.<?pagebreak page1669?> <xref ref-type="fig" rid="Ch1.F2"/>.
Maximizing the data locality of these stencil computations is crucial to optimize the time to solution of
the application.</p>
      <p id="d1e927">To enable the running of COSMO on hybrid high-performance computing systems with GPU-accelerated
compute nodes, we rewrote the dynamical core of
the model, which implements the solution to the non-hydrostatic Euler
equations, from Fortran to C++ <xref ref-type="bibr" rid="bib1.bibx23" id="paren.41"/>. This enabled us to introduce a new
C++ template library-based domain-specific language (DSL) we call
Stencil Loop Language (STELLA) <xref ref-type="bibr" rid="bib1.bibx27" id="paren.42"/> to provide a performance-portable implementation
for the stencil algorithmic motifs by abstracting hardware-dependent
optimization.  Specialized back ends of the library produce efficient
code for the target computing architecture. Additionally, the DSL
supports an analysis back end that records the access patterns and data
dependencies of the kernels shown in Fig. <xref ref-type="fig" rid="Ch1.F2"/>. This
information is then used to determine the amount of memory accesses and to assess the memory
utilization efficiency.
For GPUs, the STELLA back end
is written in CUDA, and other parts of the refactored COSMO
implementation use OpenACC directives <xref ref-type="bibr" rid="bib1.bibx37" id="paren.43"/>.</p>
      <p id="d1e942">Thanks to this refactored implementation of the model and the STELLA
DSL, COSMO is the first fully capable weather and climate  model to go
operational on GPU-accelerated supercomputers <xref ref-type="bibr" rid="bib1.bibx38" id="paren.44"/>.
In the simulations we analyze here, the model was scaled to nearly
5000 GPU-accelerated nodes of the Piz Daint supercomputer
at the Swiss National Supercomputing Centre.<fn id="Ch1.Footn5"><p id="d1e948">See
<uri>https://www.cscs.ch/computers/piz-daint/</uri> for more
information.</p></fn>
To our
knowledge, COSMO is still the only production-level weather and climate
model capable of running on GPU-accelerated hardware architectures.</p>
</sec>
<sec id="Ch1.S3.SS2">
  <title>Hardware description</title>
      <p id="d1e961">The experiments were performed on the hybrid partition of the Piz Daint
supercomputer, located at the Swiss National Supercomputing Centre (CSCS) in
Lugano.
At the time when our simulation was performed, this supercomputer consisted of a multi-core partition,
which was not used in
this study, as well as a hybrid partition
of 4936 Cray XC50 nodes. These hybrid nodes are equipped with an Intel E5-2690
v3 CPU (code name Haswell) and a PCIe version of the NVIDIA Tesla P100 GPU
(code name Pascal) with 16 GBytes second-generation high-bandwidth memory (HBM2).<fn id="Ch1.Footn6"><p id="d1e964">1 GByte = <inline-formula><mml:math id="M39" display="inline"><mml:mrow><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mn mathvariant="normal">9</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula> Bytes.</p></fn>
The nodes of both partitions are interconnected in one fabric (based on Aries
technology) in a Dragonfly topology <xref ref-type="bibr" rid="bib1.bibx1" id="paren.45"/>.</p>
</sec>
<sec id="Ch1.S3.SS3">
  <title>Energy measurements</title>
      <?pagebreak page1670?><p id="d1e988">We measure the energy to solution of our production-level runs on Piz Daint using the methodology
established and described in detail by <xref ref-type="bibr" rid="bib1.bibx21" id="text.46"/>. The resource utilization report provided on Cray systems for a job
provides the total energy (<inline-formula><mml:math id="M40" display="inline"><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>) consumed by each application run on <inline-formula><mml:math id="M41" display="inline"><mml:mi>N</mml:mi></mml:math></inline-formula> compute nodes. The total energy (which includes the interconnect)
is then computed using
            <disp-formula id="Ch1.E1" content-type="numbered"><mml:math id="M42" display="block"><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mtext>tot</mml:mtext></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:mfrac style="display"><mml:mrow><mml:msub><mml:mi>E</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mi>N</mml:mi><mml:mo>/</mml:mo><mml:mn mathvariant="normal">4</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">100</mml:mn><mml:mspace linebreak="nobreak" width="0.33em"/><mml:mi mathvariant="normal">W</mml:mi><mml:mo>×</mml:mo><mml:mi mathvariant="italic">τ</mml:mi></mml:mrow><mml:mn mathvariant="normal">0.95</mml:mn></mml:mfrac></mml:mstyle><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
          where <inline-formula><mml:math id="M43" display="inline"><mml:mi mathvariant="italic">τ</mml:mi></mml:math></inline-formula> is the wall time for the application, the <inline-formula><mml:math id="M44" display="inline"><mml:mrow><mml:mi>N</mml:mi><mml:mo>/</mml:mo><mml:mn mathvariant="normal">4</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">100</mml:mn></mml:mrow></mml:math></inline-formula> W <inline-formula><mml:math id="M45" display="inline"><mml:mo>×</mml:mo></mml:math></inline-formula> <inline-formula><mml:math id="M46" display="inline"><mml:mi mathvariant="italic">τ</mml:mi></mml:math></inline-formula> term
accounts for the 100 W per blade contribution from the Aries interconnect, and the 0.95 in the denominator
adjusts for AC/DC conversion.</p>
</sec>
<sec id="Ch1.S3.SS4">
  <title>Simulation setup and verification</title>

      <?xmltex \floatpos{t}?><fig id="Ch1.F3" specific-use="star"><caption><p id="d1e1102">Evolution of a baroclinic wave in a dry simulation with 47 km grid spacing on days 8 and
10: <bold>(a)</bold> surface pressure and <bold>(b)</bold> temperature on the 850 hPa pressure level (roughly 1.5 km
above sea level).</p></caption>
          <?xmltex \igopts{width=369.885827pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/1665/2018/gmd-11-1665-2018-f03.png"/>

        </fig>

      <p id="d1e1117">When pushing ahead the development of global high-resolution climate models,
there are two complementary pathways. First, one can refine the resolution of existing global
climate models <xref ref-type="bibr" rid="bib1.bibx45" id="paren.47"/>. Second, one may
alternatively try to expand the computational domain of high-resolution
limited-area models towards the global scale <xref ref-type="bibr" rid="bib1.bibx10" id="paren.48"/>.
Here we choose the latter and develop a near-global model from
the limited-area high-resolution model COSMO.</p>
      <p id="d1e1126">We perform near-global simulations for a computational domain that
extends to a latitude band from 80<inline-formula><mml:math id="M47" display="inline"><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup></mml:math></inline-formula> S to 80<inline-formula><mml:math id="M48" display="inline"><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup></mml:math></inline-formula> N, which covers
98.4 % of the surface area of planet Earth.
The simulation is inspired by the test case used by the winner
of the 2016 Gordon Bell Prize <xref ref-type="bibr" rid="bib1.bibx67" id="paren.49"/>.</p>
      <p id="d1e1150">The simulations are based on an idealized baroclinic wave test <xref ref-type="bibr" rid="bib1.bibx33" id="paren.50"/>,
which can be considered a standard benchmark for dynamical cores of atmospheric models. The test
describes the growth of initial disturbances in a dynamically unstable westerly jet stream into finite-amplitude
low- and high-pressure systems. The development includes a rapid transition into a
nonlinear regime, accompanied by the formation of sharp meteorological fronts, which in turn trigger
the formation of complex cloud and precipitation systems.</p>
      <p id="d1e1157">The setup uses a two-dimensional (latitude–height) analytical
description of a hydrostatically balanced atmospheric base state with westerly jet streams below the
tropopause, in both hemispheres. A large-scale local Gaussian perturbation is then applied to this
balanced initial state which triggers the formation of a growing baroclinic
wave in the Northern Hemisphere, evolving over the course of several days
(Fig. <xref ref-type="fig" rid="Ch1.F3"/>). To allow moist processes, the dry initial state is extended with a
moisture profile <xref ref-type="bibr" rid="bib1.bibx51" id="paren.51"/> and the parametrization of cloud-microphysical processes is activated.</p>
      <p id="d1e1165">The numerical problem is discretized on a latitude–longitude grid with up to 36 000 <inline-formula><mml:math id="M49" display="inline"><mml:mo>×</mml:mo></mml:math></inline-formula> 16 001
horizontal grid points for the 930 m simulation. In the zonal direction the domain is periodic
and at 80<inline-formula><mml:math id="M50" display="inline"><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup></mml:math></inline-formula> north/south
confined by boundary conditions, relaxing the evolving solution against the initial conditions in a
500 km wide zone. The vertical direction is discretized using 60 stretched model levels, spanning
from the surface to the model top at 40 km. The respective layer thickness widens from 20 m at the
surface to 1.5 km near the domain top.</p>

      <?xmltex \floatpos{t}?><fig id="Ch1.F4" specific-use="star"><caption><p id="d1e1186">Output of the baroclinic wave (day 10): (<bold>a</bold>, <bold>b</bold>) dry simulation with
1.9 km grid spacing, (<bold>c</bold>, <bold>d</bold>, <bold>e</bold>) moist simulation with 0.93 km grid spacing, (<bold>a</bold>, <bold>c</bold>) surface pressure,
(<bold>b</bold>, <bold>d</bold>) temperature on the
850 hPa pressure level, and (<bold>e</bold>) precipitation.</p></caption>
          <?xmltex \igopts{width=412.564961pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/1665/2018/gmd-11-1665-2018-f04.png"/>

        </fig>

      <p id="d1e1226">For the verification against previous dry simulations, a simulation at 47 km grid spacing is used. The evolution of
the baroclinic wave (Fig. <xref ref-type="fig" rid="Ch1.F3"/>) very closely follows the solution originally found by
<xref ref-type="bibr" rid="bib1.bibx33" id="text.52"><named-content content-type="post">see their Fig. 5</named-content></xref>.
At day 8 of the simulation, three low-pressure systems with frontal systems have formed, and at day 10 wave breaking is evident. At this
time, the surface temperature field shows cutoff warm-core cyclonic vortices.</p>
      <p id="d1e1236">The evolution in the moist and the dry simulation at very high resolution is shown in
Fig. <xref ref-type="fig" rid="Ch1.F4"/>. The high-resolution simulations reveal the onset of a secondary
(likely barotropic) instability along the front and the formation of small-scale warm-core vortices
with a spacing of up to 200–300 km. The
basic structure of these vortices is already present in the dry simulation, but they exhibit considerable
intensification and precipitation in the moist case. The formation of a large series of secondary vortices is
sometimes observed in maritime cases of cyclogenesis <xref ref-type="bibr" rid="bib1.bibx22 bib1.bibx52" id="paren.53"/> but appears to be a
rather rare phenomenon. However, it appears that cases with one or a few secondary vortices are not
uncommon and they may even be associated with severe weather <xref ref-type="bibr" rid="bib1.bibx43" id="paren.54"/>.</p>
      <p id="d1e1248">The resulting cloud pattern is dominated by the comma-shaped precipitating cloud that forms along the
cold and occluded fronts of the parent system (Fig. <xref ref-type="fig" rid="Ch1.F1"/>). The precipitation pattern is
associated with stratiform precipitation in the head of the cloud and small patches with precipitation rates
exceeding 5 mm h<inline-formula><mml:math id="M51" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> in their tail, stemming from small embedded convective cells. Looking closely at the precipitation field
(Fig. <xref ref-type="fig" rid="Ch1.F4"/>e), it can be seen that the secondary vortices are colocated
with small patches of enhanced precipitation.</p>
</sec>
</sec>
<sec id="Ch1.S4">
  <title>Efficiency metric</title>
      <p id="d1e1274">In the past, the prevalent metric to
measure the performance of an application was the number of floating point
operations executed per second (flop s<inline-formula><mml:math id="M52" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>).
It used to pay off to minimize the number of required floating point operations in an
algorithm and to implement the computing system in such a way that a
code would execute many such operations per second. Thus, it made
sense to assess application performance with the flop s<inline-formula><mml:math id="M53" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> metric. However,
the world of supercomputing has changed.
Floating point operations are now relatively cheap. They cost about a factor of 1000 less
if measured in terms of energy needed to move operands between memory
and registers, and they execute several hundred times faster compared to the
latency of memory operations. Thus, algorithmic optimization today has
to focus on minimizing data movement, and it may even pay off<?pagebreak page1671?> to
recompute certain quantities if this avoids data movements. In fact, a
significant part of the improvements in time to solution in the
refactored COSMO code are due to the recomputation of variables that were
previously stored in memory – the original code was written for vector
supercomputers in the 1990s. On today's architectures, it may even pay
off to replace a floating point minimal sparse algorithm with a
block-sparse algorithm, into which many trivial zero operations have
been introduced <xref ref-type="bibr" rid="bib1.bibx32" id="paren.55"/>. Using the flop s<inline-formula><mml:math id="M54" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> metric
to characterize application performance could be very misleading in these
cases.</p>
      <p id="d1e1316">A popular method to find performance bottlenecks of both compute- and
memory-bound
applications is the roofline model <xref ref-type="bibr" rid="bib1.bibx66" id="paren.56"/>. However,
assessing performance of an application simply in terms of sustained
memory bandwidth, which is measured in bytes per second, would be
equally deceptive. For example, by storing many variables in memory,
the original implementation of COSMO introduces an abundance of memory
movements that boost the sustained memory bandwidth but are inefficient
on modern processor architectures, since these movements cost much more
than the recomputation of these variables.</p>
      <p id="d1e1322">Furthermore, full-bandwidth utilization on modern accelerator
architectures, such as the GPUs used here, requires very specific
conditions. Optimizing for bandwidth utilization can lead the
application developer down the wrong path because unaligned, strided, or
random accesses can be intrinsic to an underlying algorithm and
severely impact the bandwidth <xref ref-type="bibr" rid="bib1.bibx39" id="paren.57"/>. Optimizations to improve
alignment or reduce the randomness of memory accesses may introduce
unnecessary memory operations that could be detrimental to
time to solution or energy efficiency.</p>
      <p id="d1e1328">Thus, in order to properly assess the quality of our optimizations, one
needs to directly consider
data movement. Here we propose a method for doing this in practice for the COSMO
dynamical core and present
the resulting MUE metric.</p>
      <p id="d1e1332">Our proposal is motivated by the full-scale COSMO runs we perform here,
where 74 % of the total time is spent in local stencil computations.
The dependencies between stencils in complex
stencil programs can be optimized with various inlining and
unrolling techniques <xref ref-type="bibr" rid="bib1.bibx27" id="paren.58"/>. Thus, to be efficient, one needs
to achieve maximum spatial and temporal data reuse to minimize the
number of data movement operations and perform them at the highest bandwidth.</p>
      <p id="d1e1338">In order to assess the efficiency in memory usage of an implementation on a particular
machine, one needs to compare the actual number of data transfers executed,<fn id="Ch1.Footn7"><p id="d1e1341">Here, we define data transfers as load/stores from system memory.</p></fn> which we denote with <inline-formula><mml:math id="M55" display="inline"><mml:mi>D</mml:mi></mml:math></inline-formula>,
with the necessary data transfers <inline-formula><mml:math id="M56" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula> of the algorithm.
<inline-formula><mml:math id="M57" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula> is the theoretical lower bound <xref ref-type="bibr" rid="bib1.bibx30" id="paren.59"/> of the number of memory accesses required to implement the
numerical algorithm.</p>
      <p id="d1e1369">The MUE can be
intuitively interpreted as how well
the code is optimized both for data locality and bandwidth utilization; i.e., if MUE <inline-formula><mml:math id="M58" display="inline"><mml:mo>=</mml:mo></mml:math></inline-formula> 1, the implementation reaches the memory movement lower bound
of the
algorithm
<italic>and</italic> performs all memory transfers with maximum bandwidth. Formally,
          <disp-formula id="Ch1.E2" content-type="numbered"><mml:math id="M59" display="block"><mml:mrow><mml:mtext>MUE</mml:mtext><mml:mo>=</mml:mo><mml:mtext>I/O efficiency</mml:mtext><mml:mo>⋅</mml:mo><mml:mtext>BW efficiency</mml:mtext><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:mfrac style="display"><mml:mi>Q</mml:mi><mml:mi>D</mml:mi></mml:mfrac></mml:mstyle><mml:mo>⋅</mml:mo><mml:mstyle displaystyle="true"><mml:mfrac style="display"><mml:mi>B</mml:mi><mml:mover accent="true"><mml:mi>B</mml:mi><mml:mo mathvariant="normal" stretchy="false">^</mml:mo></mml:mover></mml:mfrac></mml:mstyle><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
        where <inline-formula><mml:math id="M60" display="inline"><mml:mi>B</mml:mi></mml:math></inline-formula> and <inline-formula><mml:math id="M61" display="inline"><mml:mover accent="true"><mml:mi>B</mml:mi><mml:mo mathvariant="normal" stretchy="false">^</mml:mo></mml:mover></mml:math></inline-formula> represent the bandwidth achieved by an implementation
and maximum achievable bandwidth, respectively.</p>
      <p id="d1e1436">In order to compute the MUE for COSMO, we developed a performance model that
combines the theoretical model from <xref ref-type="bibr" rid="bib1.bibx30" id="text.60"/>, hypergraph
properties, and graph partitioning techniques to estimate
the necessary data transfer, <inline-formula><mml:math id="M62" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula>, from the CDAG information of COSMO. By
partitioning the CDAG into subcomputations that satisfy certain conditions
imposed by the architecture, this
model
determines the theoretical minimum amount of memory transfers by maximizing the data
locality inside the partitions. To approximate the number of actual transfers
<inline-formula><mml:math id="M63" display="inline"><mml:mi>D</mml:mi></mml:math></inline-formula>, we use the same technique to evaluate the quality of current COSMO
partitioning. The values <inline-formula><mml:math id="M64" display="inline"><mml:mi>B</mml:mi></mml:math></inline-formula> and <inline-formula><mml:math id="M65" display="inline"><mml:mover accent="true"><mml:mi>B</mml:mi><mml:mo mathvariant="normal" stretchy="false">^</mml:mo></mml:mover></mml:math></inline-formula> were measured empirically – the
former by profiling our application and the latter by a set of micro-benchmarks.</p>
      <p id="d1e1473">The details of how to determine the MUE for COSMO are given in
Appendix <xref ref-type="sec" rid="App1.Ch1.S1"/>. The MUE metric cannot be used to compare
different algorithms but is a measure of the
efficiency of an implementation of a particular algorithm on a particular machine, i.e., how much data locality is
preserved and what the achieved bandwidth is. It thus complements other metrics such
as SYPD and may complement popular metrics such as flop s<inline-formula><mml:math id="M66" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> or memory bandwidth. As compared to the
frequently applied roofline
model, the MUE metric also includes the schedule of operations, not simply the efficient use of the memory
subsystem. It is thus a stronger but also more complex metric.</p>
</sec>
<?pagebreak page1672?><sec id="Ch1.S5">
  <title>Performance results</title>
      <p id="d1e1497">To establish a performance baseline for global kilometer-scale simulations, we here present a summary of the key performance metrics
for the simulations at 930 m, 1.9 km, and 47 km grid spacing, as well
as a study of weak and strong scalability.</p>
<sec id="Ch1.S5.SS1">
  <title>Weak scalability</title>

      <?xmltex \floatpos{t}?><fig id="Ch1.F5"><caption><p id="d1e1507">Weak scalability on the hybrid P100 Piz Daint nodes, per COSMO time step of the dry simulation.</p></caption>
          <?xmltex \igopts{width=241.848425pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/1665/2018/gmd-11-1665-2018-f05.pdf"/>

        </fig>

      <p id="d1e1516">Until this study the GPU version of COSMO had only been scaled up to 1000 nodes of the Piz Daint
supercomputer <xref ref-type="bibr" rid="bib1.bibx23" id="paren.61"/>, while the full machine – at the time of the experiment – provides
4932 nodes.
The
scaling experiments (weak and strong) were performed with the dry simulation setup.
Including a cloud microphysics parametrization, as used in the moist simulation
(Fig. <xref ref-type="fig" rid="Ch1.F4"/>), increases time to solution by about 10 %. Since microphysics
does not contain any inter-node communication and is purely local to a single column of grid points, we do not expect
an adverse impact on either weak or strong scalability.</p>
      <p id="d1e1524">Figure <xref ref-type="fig" rid="Ch1.F5"/> shows weak scaling for three per-node domain
sizes ranging from <inline-formula><mml:math id="M67" display="inline"><mml:mrow><mml:mn mathvariant="normal">128</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">128</mml:mn></mml:mrow></mml:math></inline-formula> to <inline-formula><mml:math id="M68" display="inline"><mml:mrow><mml:mn mathvariant="normal">256</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">256</mml:mn></mml:mrow></mml:math></inline-formula> grid points in the horizontal, while
keeping the size in the  vertical direction fixed at 60 grid points. In
comparison, on 4888 nodes, the high-resolution simulations at 930 m and 1.9 km
horizontal grid spacing correspond to a domain size per node of about
<inline-formula><mml:math id="M69" display="inline"><mml:mrow><mml:mn mathvariant="normal">346</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">340</mml:mn></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M70" display="inline"><mml:mrow><mml:mn mathvariant="normal">173</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">170</mml:mn></mml:mrow></mml:math></inline-formula>.<fn id="Ch1.Footn8"><p id="d1e1578">The exact domain size of these
simulations is slightly different on each node due to the domain decomposition.</p></fn>
The model shows excellent weak scalability properties up to the full machine, which can at least
partially be explained by the nearest-neighbor halo-exchange pattern.
This property reduces the complexity
of COSMO's scalability on Piz Daint to the strong scaling behavior of the code.
Essentially, the number of grid points per node determines the achievable
time to solution of a given problem.</p>
</sec>
<sec id="Ch1.S5.SS2">
  <title>Strong scalability</title>

      <?xmltex \floatpos{t}?><fig id="Ch1.F6"><caption><p id="d1e1590">Strong scalability on Piz Daint: on P100 GPUs (filled symbols) and on Haswell CPUs using 12 MPI ranks per node (empty symbols).</p></caption>
          <?xmltex \igopts{width=241.848425pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/1665/2018/gmd-11-1665-2018-f06.pdf"/>

        </fig>

      <p id="d1e1599">From earlier scaling experiments of the STELLA library <xref ref-type="bibr" rid="bib1.bibx28" id="paren.62"/> and the
GPU version of COSMO <xref ref-type="bibr" rid="bib1.bibx23 bib1.bibx40" id="paren.63"/>, it is known that, for
experiments in double precision on Tesla K20x, linear scaling is achieved as
long as the number of grid points per node exceeds about <inline-formula><mml:math id="M71" display="inline"><mml:mrow><mml:mn mathvariant="normal">64</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">64</mml:mn></mml:mrow></mml:math></inline-formula> to
<inline-formula><mml:math id="M72" display="inline"><mml:mrow><mml:mn mathvariant="normal">128</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">128</mml:mn></mml:mrow></mml:math></inline-formula> grid points per horizontal plane. In comparison, single-precision measurements on Tesla P100 already start to saturate at a horizontal
domain size of about <inline-formula><mml:math id="M73" display="inline"><mml:mrow><mml:mn mathvariant="normal">200</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">200</mml:mn></mml:mrow></mml:math></inline-formula> grid points per node
(Fig. <xref ref-type="fig" rid="Ch1.F6"/>),<fn id="Ch1.Footn9"><p id="d1e1647">COSMO supports
two floating point formats to store numbers – double and single precision – which can be chosen at compile time.</p></fn> corresponding to about 32 nodes for a 19 km
setup and about 1000 nodes for a 3.7 km setup. Since with the
930 m and 1.9 km setups we are already in (930 m) or close to (1.9 km) the
linear scaling regime on the full machine, we here chose a coarser horizontal
grid spacing of 3.7 and 19 km. The lower limit on the number of nodes is given by the GPU memory of 16 GB.
In addition to the GPU benchmarks (filled symbols), we measured the performance with the CPU version of COSMO (empty
symbols), using 12 MPI (Message Passing Interface) ranks per CPU, i.e., one MPI rank per Haswell
core.<fn id="Ch1.Footn10"><p id="d1e1651">The CPU measurements were performed on the hybrid partition of Piz
Daint as well, since the multi-core partition is much smaller.</p></fn> Exceeding 1000 nodes
(<inline-formula><mml:math id="M74" display="inline"><mml:mrow><mml:mn mathvariant="normal">38</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">38</mml:mn></mml:mrow></mml:math></inline-formula> grid points per node), execution on CPUs yields a shorter time to solution
than on GPUs.</p>
</sec>
<?pagebreak page1673?><sec id="Ch1.S5.SS3">
  <title>Time to solution</title>

<?xmltex \floatpos{t}?><table-wrap id="Ch1.T1"><caption><p id="d1e1676">Time compression (SYPD) and energy cost (MWh SY<inline-formula><mml:math id="M75" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>) for three moist simulations: at 930 m
grid spacing obtained with a full 10-day simulation, at 1.9 km from
1000 steps, and at 47 km  from 100 steps.</p></caption><oasis:table frame="topbot"><oasis:tgroup cols="6">
     <oasis:colspec colnum="1" colname="col1" align="left"/>
     <oasis:colspec colnum="2" colname="col2" align="right"/>
     <oasis:colspec colnum="3" colname="col3" align="right"/>
     <oasis:colspec colnum="4" colname="col4" align="right"/>
     <oasis:colspec colnum="5" colname="col5" align="right"/>
     <oasis:colspec colnum="6" colname="col6" align="right"/>
     <oasis:thead>
       <oasis:row>
         <oasis:entry colname="col1"/>
         <oasis:entry colname="col2"/>
         <oasis:entry colname="col3"><inline-formula><mml:math id="M76" display="inline"><mml:mrow><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:math></inline-formula></oasis:entry>
         <oasis:entry rowsep="1" colname="col4"/>
         <oasis:entry rowsep="1" colname="col5">MWh</oasis:entry>
         <oasis:entry rowsep="1" colname="col6"/>
       </oasis:row>
       <oasis:row rowsep="1">
         <oasis:entry colname="col1"><inline-formula><mml:math id="M77" display="inline"><mml:mrow><mml:mfenced close="〉" open="〈"><mml:mrow><mml:mi mathvariant="normal">Δ</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:mfenced></mml:mrow></mml:math></inline-formula></oasis:entry>
         <oasis:entry colname="col2">no. of nodes</oasis:entry>
         <oasis:entry colname="col3">s</oasis:entry>
         <oasis:entry colname="col4">SYPD</oasis:entry>
         <oasis:entry colname="col5">SY<inline-formula><mml:math id="M78" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula></oasis:entry>
         <oasis:entry colname="col6">grid points</oasis:entry>
       </oasis:row>
     </oasis:thead>
     <oasis:tbody>
       <oasis:row>
         <oasis:entry colname="col1">930 m</oasis:entry>
         <oasis:entry colname="col2">4888</oasis:entry>
         <oasis:entry colname="col3">6</oasis:entry>
         <oasis:entry colname="col4">0.043</oasis:entry>
         <oasis:entry colname="col5"><inline-formula><mml:math id="M79" display="inline"><mml:mn mathvariant="normal">596</mml:mn></mml:math></inline-formula></oasis:entry>
         <oasis:entry colname="col6"><inline-formula><mml:math id="M80" display="inline"><mml:mrow><mml:mn mathvariant="normal">3.46</mml:mn><mml:mo>×</mml:mo><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mn mathvariant="normal">10</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula></oasis:entry>
       </oasis:row>
       <oasis:row>
         <oasis:entry colname="col1">1.9 km</oasis:entry>
         <oasis:entry colname="col2">4888</oasis:entry>
         <oasis:entry colname="col3">12</oasis:entry>
         <oasis:entry colname="col4">0.23</oasis:entry>
         <oasis:entry colname="col5"><inline-formula><mml:math id="M81" display="inline"><mml:mn mathvariant="normal">97.8</mml:mn></mml:math></inline-formula></oasis:entry>
         <oasis:entry colname="col6"><inline-formula><mml:math id="M82" display="inline"><mml:mrow><mml:mn mathvariant="normal">8.64</mml:mn><mml:mo>×</mml:mo><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mn mathvariant="normal">9</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula></oasis:entry>
       </oasis:row>
       <oasis:row>
         <oasis:entry colname="col1">47 km</oasis:entry>
         <oasis:entry colname="col2">18</oasis:entry>
         <oasis:entry colname="col3">300</oasis:entry>
         <oasis:entry colname="col4">9.6</oasis:entry>
         <oasis:entry colname="col5"><inline-formula><mml:math id="M83" display="inline"><mml:mn mathvariant="normal">0.0099</mml:mn></mml:math></inline-formula></oasis:entry>
         <oasis:entry colname="col6"><inline-formula><mml:math id="M84" display="inline"><mml:mrow><mml:mn mathvariant="normal">1.39</mml:mn><mml:mo>×</mml:mo><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mn mathvariant="normal">7</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula></oasis:entry>
       </oasis:row>
     </oasis:tbody>
   </oasis:tgroup></oasis:table></table-wrap>

      <p id="d1e1904">On 4888 nodes, a 10-day long moist simulation at 930 m  grid spacing required a
wall time of 15.3 h at a rate of 0.043 SYPD
(Table <xref ref-type="table" rid="Ch1.T1"/>), including a disk I/O load of five 3-D fields and seven 2-D fields,<fn id="Ch1.Footn11"><p id="d1e1909">The standard
I/O routines of COSMO require global fields on a single node, which typically does
not provide enough storage to hold a global field. To circumvent this
limitation, a binary I/O mode, allowing each node to write its output to the
file system, was implemented.</p></fn> written
periodically every 12<inline-formula><mml:math id="M85" display="inline"><mml:mspace width="0.125em" linebreak="nobreak"/></mml:math></inline-formula>h of the simulation. Short benchmark simulations at
1.9 and 47 km, integrated for 1000 and 100 time steps, respectively, yield 0.23
and 9.6 SYPD. As mentioned earlier, the minimum value required
for AMIP-type simulations is 0.2–0.3 SYPD. While for the 47 km setup scalability already
starts to saturate, even with the 18 nodes reported
(Table <xref ref-type="table" rid="Ch1.T1"/>), the high-resolution simulations, with an
approximate per-node domain size of <inline-formula><mml:math id="M86" display="inline"><mml:mrow><mml:mn mathvariant="normal">346</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">340</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">60</mml:mn></mml:mrow></mml:math></inline-formula> and
<inline-formula><mml:math id="M87" display="inline"><mml:mrow><mml:mn mathvariant="normal">173</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">170</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">60</mml:mn></mml:mrow></mml:math></inline-formula>, are still in a regime of good
scalability. We have also conducted a 1.9 km simulation with a full set of physical parametrizations
switched on, and this increases the time to solution by 27 % relative to the moist simulations presented here.</p>
      <p id="d1e1954">In conclusion, the results show that AMIP-type simulations at horizontal grid spacings of 1.9 km
using a fully fledged atmospheric model are already feasible on Europe's highest-ranking supercomputer.
In order to reach resolutions of 1 km, a further reduction of time to solution of at least a factor of 5 is
required.</p>
      <p id="d1e1957">In the remainder of this section, we attempt a comparison of our results at 1.9 km against the performance achieved by <xref ref-type="bibr" rid="bib1.bibx67" id="text.64"/>
in their 2 km simulation (as their information for some of the other simulations is incomplete). They argue
that for an implicit solver the time step can be kept at 240 s independent of the grid spacing and report
(see their Figs. 7 and 9) values of 0.57 SYPD
for a grid spacing of 2 km, on the full TaihuLight system.
As explained in Sect. <xref ref-type="sec" rid="Ch1.S2"/>,
such large time steps are not feasible for global climate simulations resolving convective clouds
(even when using implicit solvers), and a maximum time step of 40–80 s would very likely be needed;
this would decrease their SYPD by a factor of 3 to 6. Furthermore, their simulation covers only 32 % of the Earth's
surface (18<inline-formula><mml:math id="M88" display="inline"><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup></mml:math></inline-formula> N to 72<inline-formula><mml:math id="M89" display="inline"><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup></mml:math></inline-formula> N) but uses twice as many levels; this would further reduce their SYPD by a factor of 1.5.
Thus we estimate that the simulation of <xref ref-type="bibr" rid="bib1.bibx67" id="text.65"/> at 2 km would yield 0.124 to 0.064 SYPD when accounting for
these differences. In comparison, our simulation at 1.9 km yields 0.23 SYPD; i.e., it is faster by at least a factor of 2.5. Note that
this estimate does not account for additional simplifications in their study
(neglect of microphysical processes, spherical shape of the planet, and topography).
In summary, while a direct comparison with their results is difficult,
we<?pagebreak page1674?> argue that our results can be used to set a realistic
baseline for production-level GCM performance results and represent an improvement by at least a factor of 2 with respect
to previous results.</p>
</sec>
<sec id="Ch1.S5.SS4">
  <title>Energy to solution</title>
      <p id="d1e1993">Based on the power measurement (see Sect. <xref ref-type="sec" rid="Ch1.S3.SS3"/>) we now provide the energy cost of full-scale
simulations (Table <xref ref-type="table" rid="Ch1.T1"/>)
using the energy cost unit MWh per simulation year (MWh/SY).
The 10-day long simulation at 930 m grid spacing running on 4888 nodes requires <inline-formula><mml:math id="M90" display="inline"><mml:mn mathvariant="normal">596</mml:mn></mml:math></inline-formula> MWh SY<inline-formula><mml:math id="M91" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>,
while the cost of the simulation at 1.9 km on 4888 nodes is <inline-formula><mml:math id="M92" display="inline"><mml:mn mathvariant="normal">97.8</mml:mn></mml:math></inline-formula> MWh SY<inline-formula><mml:math id="M93" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>.
For comparison, the coarse-resolution simulation at 47 km on a reduced number of nodes (18) requires only <inline-formula><mml:math id="M94" display="inline"><mml:mn mathvariant="normal">0.01</mml:mn></mml:math></inline-formula> MWh SY<inline-formula><mml:math id="M95" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>.</p>
      <p id="d1e2058">A 30-year AMIP-type simulation (with full physics) at a horizontal grid spacing of 1 km on the Piz Daint
system would take 900 days to complete, resulting in an energy cost of approximately 22 GWh – which
approximately corresponds to the consumption of 6500 households during 1 year.</p>
      <p id="d1e2061">Again, we attempt a comparison with the simulations performed by
<xref ref-type="bibr" rid="bib1.bibx67" id="text.66"/>. The Piz Daint system reports a peak power draw of 2052 kW
when running the high-performance LINPACK (HPL) benchmark. The sustained power draw when running the
930 m simulation amounted to 1059.7 kW, thus 52 % of the HPL value. The
TaihuLight system reports a sustained power draw for the HPL benchmark of
15 371 kW <xref ref-type="bibr" rid="bib1.bibx62" id="paren.67"/>. While <xref ref-type="bibr" rid="bib1.bibx67" id="text.68"/> do not report power
consumption of their simulations, we expect the simulations on Piz Daint to
be at least 5 times more power efficient, even when assuming similar achieved
SYPD (see above).<fn id="Ch1.Footn12"><p id="d1e2073">We conservatively assume an application power draw
of at least 35 % of the HPL value on TaihuLight.</p></fn></p>
</sec>
<sec id="Ch1.S5.SS5">
  <title>Simulation efficiency</title>

<?xmltex \floatpos{t}?><table-wrap id="Ch1.T2"><caption><p id="d1e2085">Data transfer cost estimations of the dynamical core for a theoretical
lower bound (METIS),
dynamical core implementation using the STELLA library (COSMO), and non-optimized
dynamical core implementation (No-opt).</p></caption><oasis:table frame="topbot"><oasis:tgroup cols="4">
     <oasis:colspec colnum="1" colname="col1" align="left"/>
     <oasis:colspec colnum="2" colname="col2" align="right"/>
     <oasis:colspec colnum="3" colname="col3" align="right"/>
     <oasis:colspec colnum="4" colname="col4" align="right"/>
     <oasis:thead>
       <oasis:row rowsep="1">
         <oasis:entry colname="col1">Level</oasis:entry>
         <oasis:entry colname="col2">METIS <inline-formula><mml:math id="M96" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula></oasis:entry>
         <oasis:entry colname="col3">COSMO <inline-formula><mml:math id="M97" display="inline"><mml:mi>D</mml:mi></mml:math></inline-formula></oasis:entry>
         <oasis:entry colname="col4">No-opt <inline-formula><mml:math id="M98" display="inline"><mml:mover accent="true"><mml:mi>D</mml:mi><mml:mo mathvariant="normal" stretchy="false">^</mml:mo></mml:mover></mml:math></inline-formula></oasis:entry>
       </oasis:row>
     </oasis:thead>
     <oasis:tbody>
       <oasis:row>
         <oasis:entry colname="col1">Registers</oasis:entry>
         <oasis:entry colname="col2"><inline-formula><mml:math id="M99" display="inline"><mml:mrow><mml:mn mathvariant="normal">1.51</mml:mn><mml:mo>×</mml:mo><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mn mathvariant="normal">9</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula></oasis:entry>
         <oasis:entry colname="col3"><inline-formula><mml:math id="M100" display="inline"><mml:mrow><mml:mn mathvariant="normal">1.72</mml:mn><mml:mo>×</mml:mo><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mn mathvariant="normal">9</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula></oasis:entry>
         <oasis:entry colname="col4"><inline-formula><mml:math id="M101" display="inline"><mml:mrow><mml:mn mathvariant="normal">2.6</mml:mn><mml:mo>×</mml:mo><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mn mathvariant="normal">9</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula></oasis:entry>
       </oasis:row>
       <oasis:row>
         <oasis:entry colname="col1">Shared memory</oasis:entry>
         <oasis:entry colname="col2">64 800</oasis:entry>
         <oasis:entry colname="col3">107 600</oasis:entry>
         <oasis:entry colname="col4">229 120</oasis:entry>
       </oasis:row>
       <oasis:row>
         <oasis:entry colname="col1">L2 cache</oasis:entry>
         <oasis:entry colname="col2">1023</oasis:entry>
         <oasis:entry colname="col3">1160</oasis:entry>
         <oasis:entry colname="col4">2341</oasis:entry>
       </oasis:row>
     </oasis:tbody>
   </oasis:tgroup></oasis:table></table-wrap>

      <p id="d1e2228">In view of the energy consumption, it is paramount to consider the efficiency of the simulations
executed. We do this here by considering the memory usage efficiency metric
introduced in Sect. <xref ref-type="sec" rid="Ch1.S4"/>.</p>
      <p id="d1e2233">To estimate a solution for the optimization problem described in Appendix <xref ref-type="sec" rid="App1.Ch1.S1"/>,
Eq. (<xref ref-type="disp-formula" rid="App1.Ch1.E2"/>), we use the METIS library <xref ref-type="bibr" rid="bib1.bibx34" id="paren.69"/>. The
results are presented in Table <xref ref-type="table" rid="Ch1.T2"/>. The
METIS <inline-formula><mml:math id="M102" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula> column is the approximation of the lower bound <inline-formula><mml:math id="M103" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula> obtained
from the
performance model using the METIS library. The COSMO
<inline-formula><mml:math id="M104" display="inline"><mml:mi>D</mml:mi></mml:math></inline-formula> column is the evaluation of the current COSMO partitioning in our model.
The no-opt <inline-formula><mml:math id="M105" display="inline"><mml:mover accent="true"><mml:mi>D</mml:mi><mml:mo mathvariant="normal" stretchy="false">^</mml:mo></mml:mover></mml:math></inline-formula> column shows the
amount of data transfers of COSMO if no data locality optimization techniques
are applied, like in the
original Fortran version of the code.
Since the original CDAG is too large for the minimization problem of the performance model,
three different simplified versions of the CDAG, which focus on the accesses to three
different layers of the memory hierarchy of the GPU, are studied: registers, shared memory, and L2 cache.</p>
      <p id="d1e2277">The model shows how efficient COSMO is in terms of data transfers – it
generates only 14,
66, and 13 % more data transfers than the lower bound in the register, shared memory, and L2 cache layers, respectively.
The sophisticated data locality optimization techniques of the STELLA implementation of the dynamical
core of COSMO result
in very good data reuse. On the P100, all memory accesses
to/from DRAM go through the L2 unit. Therefore, we focus on the
efficiency of this unit such that
            <disp-formula id="Ch1.E3" content-type="numbered"><mml:math id="M106" display="block"><mml:mrow><mml:mtext>MUE</mml:mtext><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:mfrac style="display"><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mi>L</mml:mi><mml:mn mathvariant="normal">2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>L</mml:mi><mml:mn mathvariant="normal">2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle><mml:mo>⋅</mml:mo><mml:mstyle displaystyle="true"><mml:mfrac style="display"><mml:mi>B</mml:mi><mml:mover accent="true"><mml:mi>B</mml:mi><mml:mo stretchy="false" mathvariant="normal">^</mml:mo></mml:mover></mml:mfrac></mml:mstyle><mml:mo>=</mml:mo><mml:mn mathvariant="normal">0.88</mml:mn><mml:mo>⋅</mml:mo><mml:mn mathvariant="normal">0.76</mml:mn><mml:mo>=</mml:mo><mml:mn mathvariant="normal">0.67</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
          where <inline-formula><mml:math id="M107" display="inline"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>L</mml:mi><mml:mn mathvariant="normal">2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M108" display="inline"><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mi>L</mml:mi><mml:mn mathvariant="normal">2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> stand for the estimated number of main memory
operations and its lower bound, respectively.</p>

<?xmltex \floatpos{t}?><table-wrap id="Ch1.T3"><caption><p id="d1e2368">Performance model verification results. Measured dynamical core time step execution time for the STELLA
implementation optimized for data locality and the non-optimized implementation compared against the
corresponding MUE metric.</p></caption><oasis:table frame="topbot"><oasis:tgroup cols="4">
     <oasis:colspec colnum="1" colname="col1" align="left"/>
     <oasis:colspec colnum="2" colname="col2" align="right"/>
     <oasis:colspec colnum="3" colname="col3" align="right"/>
     <oasis:colspec colnum="4" colname="col4" align="right"/>
     <oasis:thead>
       <oasis:row rowsep="1">
         <oasis:entry colname="col1">Metric</oasis:entry>
         <oasis:entry colname="col2">Optimized</oasis:entry>
         <oasis:entry colname="col3">Not optimized</oasis:entry>
         <oasis:entry colname="col4">Ratio</oasis:entry>
       </oasis:row>
     </oasis:thead>
     <oasis:tbody>
       <oasis:row>
         <oasis:entry colname="col1">Time per step</oasis:entry>
         <oasis:entry colname="col2">0.16 s</oasis:entry>
         <oasis:entry colname="col3">0.25 s</oasis:entry>
         <oasis:entry colname="col4">0.64</oasis:entry>
       </oasis:row>
       <oasis:row>
         <oasis:entry colname="col1">Estimated MUE</oasis:entry>
         <oasis:entry colname="col2">0.67</oasis:entry>
         <oasis:entry colname="col3">0.44</oasis:entry>
         <oasis:entry colname="col4">0.65</oasis:entry>
       </oasis:row>
     </oasis:tbody>
   </oasis:tgroup></oasis:table></table-wrap>

      <p id="d1e2436">The model can also estimate the efficiency of our optimizations. Assuming that
we can reach the peak achievable bandwidth <inline-formula><mml:math id="M109" display="inline"><mml:mover accent="true"><mml:mi>B</mml:mi><mml:mo stretchy="false" mathvariant="normal">^</mml:mo></mml:mover></mml:math></inline-formula> if we perform no data
locality optimizations (<inline-formula><mml:math id="M110" display="inline"><mml:mover accent="true"><mml:mi>D</mml:mi><mml:mo mathvariant="normal" stretchy="false">^</mml:mo></mml:mover></mml:math></inline-formula>), then
            <disp-formula id="Ch1.E4" content-type="numbered"><mml:math id="M111" display="block"><mml:mrow><mml:msub><mml:mtext>MUE</mml:mtext><mml:mtext>no_opt</mml:mtext></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:mfrac style="display"><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:mi>L</mml:mi><mml:mn mathvariant="normal">2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>D</mml:mi><mml:mo mathvariant="normal" stretchy="false">^</mml:mo></mml:mover><mml:mrow><mml:mi>L</mml:mi><mml:mn mathvariant="normal">2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle><mml:mo>⋅</mml:mo><mml:mstyle displaystyle="true"><mml:mfrac style="display"><mml:mover accent="true"><mml:mi>B</mml:mi><mml:mo stretchy="false" mathvariant="normal">^</mml:mo></mml:mover><mml:mover accent="true"><mml:mi>B</mml:mi><mml:mo stretchy="false" mathvariant="normal">^</mml:mo></mml:mover></mml:mfrac></mml:mstyle><mml:mo>=</mml:mo><mml:mn mathvariant="normal">0.44</mml:mn><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula></p>
      <p id="d1e2514">It can be seen that
            <disp-formula id="Ch1.E5" content-type="numbered"><mml:math id="M112" display="block"><mml:mrow><mml:mstyle displaystyle="true"><mml:mfrac style="display"><mml:mtext>MUE</mml:mtext><mml:mrow><mml:msub><mml:mtext>MUE</mml:mtext><mml:mrow><mml:mtext>no</mml:mtext><mml:mi mathvariant="italic">_</mml:mi><mml:mtext>opt</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:mfrac style="display"><mml:mn mathvariant="normal">0.67</mml:mn><mml:mn mathvariant="normal">0.44</mml:mn></mml:mfrac></mml:mstyle><mml:mo>=</mml:mo><mml:mn mathvariant="normal">1.52</mml:mn><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula></p>
      <p id="d1e2551">This result shows the importance of data locality optimizations – the optimized
implementation is more than 50 % faster than a potential version that achieves
peak bandwidth while using no data locality techniques.</p>
      <?pagebreak page1675?><p id="d1e2554">To validate the model results, we have conducted single-node runs with and
without data locality optimizations. Bandwidth measurements of the non-optimized
version are very close to the maximum achievable bandwidth (not shown). The fact that the two ratios in the Ratio
column of Table <xref ref-type="table" rid="Ch1.T3"/> agree is testimony to the high precision of
the performance model.</p>
</sec>
</sec>
<sec id="Ch1.S6" sec-type="conclusions">
  <title>Conclusions</title>
      <p id="d1e2566">The work presented here sets a new baseline for fully fledged
kilometer-scale climate simulations on a global scale. Our implementation of
the COSMO model that is used for production-level numerical weather
predictions at MeteoSwiss has been scaled to the full system on 4888 nodes of Piz Daint,
a GPU-accelerated Cray XC50 supercomputer at the Swiss National Supercomputing
Centre (CSCS). The dynamical core has been fully rewritten in C++ using a DSL
that abstracts the hardware architecture for the stencil algorithmic
motifs and enables sophisticated tuning of data movements.
Optimized back ends are available for multi-core processors with OpenMP, GPU
accelerators with CUDA, and for performance analysis.</p>
      <p id="d1e2569">The code shows excellent strong scaling up to the full machine size when
running at a grid spacing of 4 km and below, on both the P100 GPU
accelerators and the Haswell CPU. For smaller problems, e.g., at a coarser
grid spacing of 47 km, the GPUs run out of parallelism and strong
scalability is limited to about 100 nodes, while the same problem
continues to scale on multi-core processors to 1000 nodes.
Weak scalability is optimal for the full size of the machine.
Overall, performance is significantly better on GPUs as compared to CPUs for the
high-resolution simulations.</p>
      <p id="d1e2572">The simulations performed here are based on the idealized baroclinic
wave test <xref ref-type="bibr" rid="bib1.bibx33" id="paren.70"/>, which is part of the standard procedure to test global
atmospheric models. Our results are validated against the original
solution published in that paper.</p>
      <p id="d1e2578">We measured time to solution in terms of simulated years per wall clock
day (SYPD) for a near-global simulation on a latitude band from
80<inline-formula><mml:math id="M113" display="inline"><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup></mml:math></inline-formula> S to 80<inline-formula><mml:math id="M114" display="inline"><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup></mml:math></inline-formula> N that covers 98.4 % of planet Earth's surface. Running on 4888 P100 GPUs of Piz Daint,
currently Europe's largest supercomputer, we measured 0.23 SYPD at a 1.9 km
grid spacing. This performance is adequate for numerical weather predictions and
10-year scale climate studies. Simulations at this resolution had an energy cost of 97.8 MWh SY<inline-formula><mml:math id="M115" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>.</p>
      <p id="d1e2612">In a moist simulation with 930 m horizontal grid spacing, we observed
the formation of frontal precipitating systems, containing embedded explicitly resolved convective motions, and additional
cutoff warm-core cyclonic vortices. The explicit representation of embedded moist
convection and the representation of the resolved instabilities exhibit physically different
behavior from coarser-resolution simulations. This is testimony to the usefulness of the
high-resolution approach, as a much expanded range of scales and processes
is simulated. Indeed it
appears that for the current test case, the small vortices have not previously been noted,
as the test case appears to converge for resolutions down to 25 km <xref ref-type="bibr" rid="bib1.bibx63" id="paren.71"/>, but they
clearly emerge at kilometer-scale resolution.</p>
      <p id="d1e2618">These results serve as a baseline benchmark for global climate model
applications. For the 930 m experiment we achieved 0.043 SYPD on 4888 GPU-accelerated
nodes, which is approximately one-seventh of the 0.2–0.3 SYPD required to conduct AMIP-type
simulations. The energy cost for simulations at this horizontal grid spacing was 596 MWh SY<inline-formula><mml:math id="M116" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>.</p>
      <p id="d1e2633">Scalable global models do not use a regularly structured grid such as that of COSMO, due to the “pole problem”.
Therefore, scalable global models tend to use quasi-uniform grids such as cubed-sphere or icosahedral
grids. We believe that our results apply directly to global weather and climate models employing
structured grids and explicit, split-explicit, or horizontally explicit, vertically implicit (HEVI) time discretizations (e.g., FV3, NICAM).
Global models employing implicit or spectral solvers may have a different scaling behavior.</p>
      <p id="d1e2636">Our work was inspired by the dynamical core solver that won the 2016 Gordon
Bell award at the Supercomputing Conference <xref ref-type="bibr" rid="bib1.bibx67" id="paren.72"/>. The goal of this award is
to reward high performance achieved in the context of a realistic
computation. A direct comparison with the results reported there is difficult, since <xref ref-type="bibr" rid="bib1.bibx67" id="text.73"/>
were running a research version of a dynamical core solver and COSMO is a fully fledged
regional weather and climate model. Nevertheless, our analysis indicates that our benchmarks
represent an improvement both in terms of time to solution and energy to solution.
As far as we know, our results represent the fastest (in terms of application throughput
measured in SYPD at 1 km grid spacing) simulation of a production-ready, non-hydrostatic climate
model on a near-global computational domain.</p>
      <p id="d1e2645">In order to reach the 3–5 SYPD performance necessary for long climate runs, simulations would be needed
that run 100 times faster than the baseline we set here. Given that Piz Daint with NVIDIA P100 GPUs is a
multi-petaflop system, the 1 km scale climate runs at 3–5 SYPD performance represent a formidable
challenge for exascale systems. As such simulations are of interest to scientists around the globe,
we propose that this challenge be defined as a goal post to be reached by the exascale systems that
will be deployed in the next decade.</p>
      <p id="d1e2648">Finally, we propose a new approach to measure the efficiency of memory bound application codes,
like many weather and climate models, running
on modern supercomputing systems. Since data movement is the expensive commodity on modern
processors, we advocate that the code's performance on a given machine be characterized in terms<?pagebreak page1676?> of
data movement efficiency. We note that both a detailed mathematical analysis and the general applicability
of our MUE metric are beyond the scope of this paper and require additional publication. In this work we show
a use case of the MUE metric and demonstrate its precision and usefulness in assessing
memory subsystem utilization of a machine, using
the I/O complexity lower bound as the necessary data transfers,
<inline-formula><mml:math id="M117" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula>, of the algorithm. With the time to solution and the maximum system bandwidth <inline-formula><mml:math id="M118" display="inline"><mml:mover accent="true"><mml:mi>B</mml:mi><mml:mo mathvariant="normal" stretchy="false">^</mml:mo></mml:mover></mml:math></inline-formula>, it is possible
to determine the memory usage efficiency that captures how well the code is
optimized both for data locality and bandwidth utilization. It will be interesting to
investigate the MUE metric in future performance evaluations for weather and climate codes on high-performance
computing systems.</p><?xmltex \hack{\newpage}?>
</sec>

      
      </body>
    <back><notes notes-type="codedataavailability">

      <p id="d1e2673">The particular version of the COSMO model used in this study is based on the
official version 5.0 with many additions to enable GPU capability and is available under
license (see <uri>http://www.cosmo-model.org/content/consortium/licencing.htm</uri> for more information).
These developments are currently in the process of being reintegrated into the mainline COSMO version.
COSMO may be used for operational and for research applications by the members of the COSMO consortium.
Moreover, within a license agreement, the COSMO model may be used for operational and research
applications by other national (hydro-)meteorological services, universities, and research
institutes. The model output data will be archived for a limited amount of time and are available on request.</p>
  </notes><?xmltex \hack{\clearpage}?><app-group>

<?pagebreak page1677?><app id="App1.Ch1.S1">
  <title>Computing the efficiency metric</title>
      <p id="d1e2688">Here, we discuss how to determine the memory usage efficiency (MUE) given in
Eq. (<xref ref-type="disp-formula" rid="Ch1.E2"/>) for an application code, such as the COSMO model.
We need to determine the necessary data transfers <inline-formula><mml:math id="M119" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula> and the maximum system bandwidth
<inline-formula><mml:math id="M120" display="inline"><mml:mover accent="true"><mml:mi>B</mml:mi><mml:mo stretchy="false" mathvariant="normal">^</mml:mo></mml:mover></mml:math></inline-formula>, and measure the execution time <inline-formula><mml:math id="M121" display="inline"><mml:mi>T</mml:mi></mml:math></inline-formula>.</p>
<sec id="App1.Ch1.S1.SS1">
  <?xmltex \opttitle{Necessary transfers $Q$}?><title>Necessary transfers <inline-formula><mml:math id="M122" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula></title>
      <p id="d1e2729">A natural representation of data flow and dependencies of algorithms is
a CDAG. This abstraction is
widely used for register allocation optimization, scheduling problems, and communication minimization.
A CDAG models computations as vertices (<inline-formula><mml:math id="M123" display="inline"><mml:mi>V</mml:mi></mml:math></inline-formula>) and communications as edges (<inline-formula><mml:math id="M124" display="inline"><mml:mi>E</mml:mi></mml:math></inline-formula>)
between them. A CDAG can be used to develop theoretical models that
reason about data movements of an application. However, not all the edges
of the CDAG account for data transfers, since the data required by a computation
might be stored in fast memory (cached),
depending on the execution schedule. Finding an execution schedule that minimizes
the transaction metric is NP-hard for general
CDAGs and therefore an intractable problem <xref ref-type="bibr" rid="bib1.bibx36" id="paren.74"/>.</p>
      <p id="d1e2749">Minimizing data movement has been the subject of many studies. Two
main approaches have been established: (1) finding analytical lower
bounds for chosen algorithms for a given machine
model <xref ref-type="bibr" rid="bib1.bibx30 bib1.bibx64 bib1.bibx26" id="paren.75"/> and (2) finding optimal graph
partitions <xref ref-type="bibr" rid="bib1.bibx24" id="paren.76"/>. The former is designed for
particular, highly regular small algorithms, like
sorting <xref ref-type="bibr" rid="bib1.bibx64" id="paren.77"/> or matrix multiplication <xref ref-type="bibr" rid="bib1.bibx30" id="paren.78"/>
and is not suitable for large-scale applications like COSMO. The latter approach
is mostly used for minimizing network
communication <xref ref-type="bibr" rid="bib1.bibx42" id="paren.79"/> and has not been applied to
large applications either. In our performance model, we combine the two
into a novel graph-cutting technique. We build on Hong and Kung's
2S partitioning <xref ref-type="bibr" rid="bib1.bibx30" id="paren.80"/> and construct a hypergraph
partitioning technique to estimate a memory movement lower bound. We do not consider
internode communication here. To the
best of our knowledge, we are the first to apply these techniques to a
real-world parallel application.</p>
      <p id="d1e2771">The key concept behind estimating <inline-formula><mml:math id="M125" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula> is to partition the whole
CDAG <inline-formula><mml:math id="M126" display="inline"><mml:mrow><mml:mi>G</mml:mi><mml:mo>=</mml:mo><mml:mo>(</mml:mo><mml:mi>V</mml:mi><mml:mo>,</mml:mo><mml:mi>E</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> into subcomputations
(2<inline-formula><mml:math id="M127" display="inline"><mml:mi>S</mml:mi></mml:math></inline-formula> partitions) <inline-formula><mml:math id="M128" display="inline"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>:</mml:mo><mml:mo>⋃</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>V</mml:mi></mml:mrow></mml:math></inline-formula>,
such that each <inline-formula><mml:math id="M129" display="inline"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> requires at most <inline-formula><mml:math id="M130" display="inline"><mml:mi>S</mml:mi></mml:math></inline-formula> data transfer operations. Then, if
<inline-formula><mml:math id="M131" display="inline"><mml:mrow><mml:mi>H</mml:mi><mml:mo>(</mml:mo><mml:mn mathvariant="normal">2</mml:mn><mml:mi>S</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> is the minimal number of 2<inline-formula><mml:math id="M132" display="inline"><mml:mi>S</mml:mi></mml:math></inline-formula> partitions for a given CDAG,
<xref ref-type="bibr" rid="bib1.bibx30" id="text.81"/> showed that the minimal
number <inline-formula><mml:math id="M133" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula> of memory movement operations for any valid execution
of the
CDAG is bounded by

                <disp-formula id="App1.Ch1.E1" content-type="numbered"><mml:math id="M134" display="block"><mml:mrow><mml:mstyle class="stylechange" displaystyle="true"/><mml:mi>Q</mml:mi><mml:mo>≥</mml:mo><mml:mi>S</mml:mi><mml:mo>×</mml:mo><mml:mo>(</mml:mo><mml:mi>H</mml:mi><mml:mo>(</mml:mo><mml:mn mathvariant="normal">2</mml:mn><mml:mi>S</mml:mi><mml:mo>)</mml:mo><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn><mml:mo>)</mml:mo><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula></p>
      <p id="d1e2919">Here we outline the key steps of our modeling approach:
<list list-type="order"><list-item>
      <p id="d1e2924">We reduce Hong and Kung's 2<inline-formula><mml:math id="M135" display="inline"><mml:mi>S</mml:mi></mml:math></inline-formula> partitioning <xref ref-type="bibr" rid="bib1.bibx30" id="paren.82"/>
definition to hypergraph cut by relaxing the constraints on the dominator
and minimum set sizes. Each hyper-edge contains a vertex from the original
CDAG and all its successors.</p></list-item><list-item>
      <p id="d1e2938">We approximate the minimal hypergraph cut by minimizing the
total communication volume.</p></list-item><list-item>
      <p id="d1e2942">We then express the memory movement lower bound as<disp-formula id="App1.Ch1.E2" content-type="numbered"><mml:math id="M136" display="block"><mml:mstyle displaystyle="true" class="stylechange"/><mml:mrow><mml:mstyle class="stylechange" displaystyle="true"/><mml:mtext>min</mml:mtext><mml:munder><mml:mo movablelimits="false">∑</mml:mo><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:munder><mml:munder><mml:mo movablelimits="false">∑</mml:mo><mml:mrow><mml:mi>v</mml:mi><mml:mo>∈</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:munder><mml:mi>w</mml:mi><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo><mml:mo>⋅</mml:mo><mml:mo>(</mml:mo><mml:mtext>Nbr</mml:mtext><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn><mml:mo>)</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>subject to<disp-formula specific-use="align" content-type="numbered"><mml:math id="M137" display="block"><mml:mtable displaystyle="true"><mml:mlabeledtr id="App1.Ch1.E3"><mml:mtd/><mml:mtd><mml:mstyle class="stylechange" displaystyle="true"/></mml:mtd><mml:mtd><mml:mrow><mml:mstyle class="stylechange" displaystyle="true"/><mml:mo movablelimits="false">⋃</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>V</mml:mi><mml:mo>,</mml:mo></mml:mrow></mml:mtd></mml:mlabeledtr><mml:mlabeledtr id="App1.Ch1.E4"><mml:mtd/><mml:mtd><mml:mstyle class="stylechange" displaystyle="true"/></mml:mtd><mml:mtd><mml:mrow><mml:mstyle class="stylechange" displaystyle="true"/><mml:msub><mml:mo>∀</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>≠</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>∩</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi 
mathvariant="normal">∅</mml:mi><mml:mo>,</mml:mo></mml:mrow></mml:mtd></mml:mlabeledtr><mml:mlabeledtr id="App1.Ch1.E5"><mml:mtd/><mml:mtd><mml:mstyle displaystyle="true" class="stylechange"/></mml:mtd><mml:mtd><mml:mrow><mml:mstyle displaystyle="true" class="stylechange"/><mml:msub><mml:mo>∀</mml:mo><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msub><mml:munder><mml:mo movablelimits="false">∑</mml:mo><mml:mrow><mml:mi>v</mml:mi><mml:mo>∈</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:munder><mml:mi>w</mml:mi><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo><mml:mo>⋅</mml:mo><mml:mo>(</mml:mo><mml:mtext>Nbr</mml:mtext><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn><mml:mo>)</mml:mo><mml:mo>≤</mml:mo><mml:mn mathvariant="normal">2</mml:mn><mml:mi>S</mml:mi><mml:mo>,</mml:mo></mml:mrow></mml:mtd></mml:mlabeledtr></mml:mtable></mml:math></disp-formula>where Nbr<inline-formula><mml:math id="M138" display="inline"><mml:mrow><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> is the number of partitions that vertex <inline-formula><mml:math id="M139" display="inline"><mml:mi>v</mml:mi></mml:math></inline-formula>
is adjacent to, <inline-formula><mml:math id="M140" display="inline"><mml:mrow><mml:mi>w</mml:mi><mml:mo>(</mml:mo><mml:mi>v</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> is the memory size of vertex <inline-formula><mml:math id="M141" display="inline"><mml:mi>v</mml:mi></mml:math></inline-formula>, and <inline-formula><mml:math id="M142" display="inline"><mml:mi>S</mml:mi></mml:math></inline-formula> is
the size of memory at a level for which the optimization is
performed. Equation (<xref ref-type="disp-formula" rid="App1.Ch1.E2"/>) now minimizes the
sum of the communication volume across all partitions (assuming we
load partitions one after the other), while constraint
Eq. (<xref ref-type="disp-formula" rid="App1.Ch1.E5"/>) bounds the boundary weight for each
partition to <inline-formula><mml:math id="M143" display="inline"><mml:mrow><mml:mn mathvariant="normal">2</mml:mn><mml:mi>S</mml:mi></mml:mrow></mml:math></inline-formula> such that it fits in fast memory.</p></list-item></list></p>
</sec>
<sec id="App1.Ch1.S1.SS2">
  <title>COSMO CDAG</title>

<?xmltex \floatpos{t}?><table-wrap id="App1.Ch1.T1"><caption><p id="d1e3202">COSMO CDAGs at various GPU memory hierarchy levels.</p></caption><oasis:table frame="topbot"><oasis:tgroup cols="5">
     <oasis:colspec colnum="1" colname="col1" align="left"/>
     <oasis:colspec colnum="2" colname="col2" align="left"/>
     <oasis:colspec colnum="3" colname="col3" align="right"/>
     <oasis:colspec colnum="4" colname="col4" align="right"/>
     <oasis:colspec colnum="5" colname="col5" align="right"/>
     <oasis:thead>
       <oasis:row rowsep="1">
         <oasis:entry colname="col1">Memory level</oasis:entry>
         <oasis:entry colname="col2">Vertex def.</oasis:entry>
         <oasis:entry colname="col3"><inline-formula><mml:math id="M144" display="inline"><mml:mrow><mml:mo>|</mml:mo><mml:mi>V</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:math></inline-formula></oasis:entry>
         <oasis:entry colname="col4"><inline-formula><mml:math id="M145" display="inline"><mml:mrow><mml:mo>|</mml:mo><mml:mi>E</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:math></inline-formula></oasis:entry>
         <oasis:entry colname="col5"><inline-formula><mml:math id="M146" display="inline"><mml:mi>S</mml:mi></mml:math></inline-formula></oasis:entry>
       </oasis:row>
     </oasis:thead>
     <oasis:tbody>
       <oasis:row>
         <oasis:entry colname="col1">Registers</oasis:entry>
         <oasis:entry colname="col2">one IJK value</oasis:entry>
         <oasis:entry colname="col3">157 803</oasis:entry>
         <oasis:entry colname="col4">984 101</oasis:entry>
         <oasis:entry colname="col5">32</oasis:entry>
       </oasis:row>
       <oasis:row>
         <oasis:entry colname="col1">Shared memory</oasis:entry>
         <oasis:entry colname="col2">one IJ plane</oasis:entry>
         <oasis:entry colname="col3">2649</oasis:entry>
         <oasis:entry colname="col4">12 137</oasis:entry>
         <oasis:entry colname="col5">8</oasis:entry>
       </oasis:row>
       <oasis:row>
         <oasis:entry colname="col1">L2 cache</oasis:entry>
         <oasis:entry colname="col2">whole array</oasis:entry>
         <oasis:entry colname="col3">1912</oasis:entry>
         <oasis:entry colname="col4">9863</oasis:entry>
         <oasis:entry colname="col5">29</oasis:entry>
       </oasis:row>
     </oasis:tbody>
   </oasis:tgroup></oasis:table></table-wrap>

      <p id="d1e3324">Figure <xref ref-type="fig" rid="Ch1.F2"/> shows the data dependency CDAG of the
computational kernels of the dynamical core of COSMO,
where each kernel corresponds to a complex set of fused stencil operations in
order to maximize the
data locality of the algorithm. A single time step in COSMO accesses 781
variables, each of which is
represented by a <inline-formula><mml:math id="M147" display="inline"><mml:mrow><mml:mn mathvariant="normal">346</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">340</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">60</mml:mn></mml:mrow></mml:math></inline-formula> array for our 930 m simulation. Some
variables are updated multiple times during a time step which results in
a total number of variable accesses (CDAG vertices) of more than
<inline-formula><mml:math id="M148" display="inline"><mml:mrow><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mn mathvariant="normal">10</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula>.
The resulting large and complex graph makes estimating <inline-formula><mml:math id="M149" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula> impractical.
In order to reduce the complexity, one can coarsen the CDAG by grouping multiple accesses
into a single vertex. As an example, Fig. <xref ref-type="fig" rid="Ch1.F2"/> shows
the coarsest representation of the CDAG where each vertex models a full kernel.
Each<?pagebreak page1678?> kernel may read and write various output variables, compute
multiple stencil operations and boundary conditions, or perform halo
exchanges.
Thus, in this coarsened version, valuable data dependency information
is lost, and one cannot argue about the optimality
and possible rearrangement of the operations fused within a kernel.</p>
      <p id="d1e3365">We now describe how we determine coarsening strategies of the
COSMO CDAG for three levels of the memory hierarchy of our target
system: registers, shared memory, and L2 cache.
<list list-type="order"><list-item>
      <p id="d1e3370">Registers (65 536): the COSMO GPU implementation assigns all
computations
accessing variables with the same IJ coordinate to the same GPU
thread and uses registers to reuse values in the K direction. To model this
memory hierarchy, it is only necessary to keep the stencil accesses
in the K direction. Thus, all accesses in the IJ plane are represented
as a single vertex in the CDAG, which is then simplified to
781 variables and their dependencies
among all 60 levels in K.</p></list-item><list-item>
      <p id="d1e3374">Shared memory (64 kB): the shared memory of the GPU is
used to communicate values between the different compute threads. In order
to model this layer, all different accesses in the K direction of a variable
are represented as a single vertex in the CDAG, while all accesses in the IJ
plane are kept.</p></list-item><list-item>
      <p id="d1e3378">L2 cache (4 MB): this last cache level before DRAM is
used to store whole arrays (fields). In this layer all accesses to a variable
in any direction are represented as a vertex in the CDAG.
It keeps only the data dependencies among variables, irrespective of the
offset and direction of the access.</p></list-item></list>
Table <xref ref-type="table" rid="App1.Ch1.T1"/> lists details about the CDAGs at each of
our three levels. The memory capacity of the GPU for each of the three layers is then used
as a constraint to derive the parameter <inline-formula><mml:math id="M150" display="inline"><mml:mi>S</mml:mi></mml:math></inline-formula> (see values in Table <xref ref-type="table" rid="App1.Ch1.T1"/>).
Values of the estimation of <inline-formula><mml:math id="M151" display="inline"><mml:mi>Q</mml:mi></mml:math></inline-formula>, obtained from the performance model for the three memory
levels, are shown in Table <xref ref-type="table" rid="Ch1.T2"/>.</p>

      <fig id="App1.Ch1.F1"><caption><p id="d1e3404">Bandwidth of the representative stencil benchmarks and GPU STREAM on
Tesla P100. All kernels (except for GPU STREAM) operate on a 3-D domain.</p></caption>
          <?xmltex \igopts{width=241.848425pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/1665/2018/gmd-11-1665-2018-f07.pdf"/>

        </fig>

      <p id="d1e3414">To generate the whole CDAG, we used the STELLA analysis back
end <xref ref-type="bibr" rid="bib1.bibx28" id="paren.83"/> to trace all local memory accesses to all fields.
Based on the information from the access offsets and order of
operations, we reconstruct the read–write, write–read, and write–write
dependencies.</p>
</sec>
<sec id="App1.Ch1.S1.SS3">
  <?xmltex \opttitle{Maximum achievable bandwidth $\hat{B}$}?><title>Maximum achievable bandwidth <inline-formula><mml:math id="M152" display="inline"><mml:mover accent="true"><mml:mi>B</mml:mi><mml:mo stretchy="false" mathvariant="normal">^</mml:mo></mml:mover></mml:math></inline-formula></title>
      <p id="d1e3444">We now describe how we measure the memory usage efficiency in practice.
We start by describing how to determine the maximum achievable bandwidth
for COSMO stencils. Even though the Tesla P100 has a theoretical peak memory bandwidth of
720 GB s<inline-formula><mml:math id="M154" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> <xref ref-type="bibr" rid="bib1.bibx48" id="paren.84"/>, we argue that this may not be achievable for
real applications.
A well-established method to measure the maximum achievable bandwidth of
GPUs is the GPU STREAM benchmark <xref ref-type="bibr" rid="bib1.bibx15" id="paren.85"/>. Our tests show that
the maximum achievable bandwidth for COPY is 557 GB s<inline-formula><mml:math id="M155" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> if at least 30 MB
of double-precision numbers are copied (Fig. <xref ref-type="fig" rid="App1.Ch1.F1"/>).
However, stencil codes on multidimensional domains like COSMO require
more complex memory access patterns that, even when highly tuned, cannot
achieve the same bandwidth as STREAM due to architectural limitations.</p>
      <p id="d1e3479">We identified the most common patterns and designed and tuned a set of
micro-benchmarks that only mimic the memory access patterns without
the computations to investigate the machine capability of handling
memory accesses for stencils. They include aligned, unaligned, and
strided patterns in all dimensions.  All benchmarks operate on a 3-D
domain of parametric size, on either single- or double-precision numbers.
The results of a representative set of four chosen micro-benchmarks are
shown in Fig. <xref ref-type="fig" rid="App1.Ch1.F1"/>, together with the GPU STREAM,
which operates on a 1-D domain. The fastest stencil kernel (double-precision aligned COPY) reaches 510 GB s<inline-formula><mml:math id="M156" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>. The slowdown compared to
GPU STREAM is due to the more complex access pattern in the 3-D domain.
In addition, using single-precision numbers further reduces the
bandwidth on P100 (COPY (float) reaches 475 GB s<inline-formula><mml:math id="M157" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>). Our COSMO 930 m run
uses predominantly single-precision numbers on a <inline-formula><mml:math id="M158" display="inline"><mml:mrow><mml:mn mathvariant="normal">346</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">340</mml:mn><mml:mo>×</mml:mo><mml:mn mathvariant="normal">60</mml:mn></mml:mrow></mml:math></inline-formula> domain, which results in
28.2 MB of data per field. Our measurements show that the maximum
achievable bandwidth for this setup is 362 GB s<inline-formula><mml:math id="M159" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> (in the best case of the
simple COPY benchmark). We will use this upper-bound number as the
maximum system bandwidth. The average measured memory bandwidth across
all COSMO real-world stencils is 276 GB s<inline-formula><mml:math id="M160" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, which gives <inline-formula><mml:math id="M161" display="inline"><mml:mrow><mml:mstyle displaystyle="false"><mml:mfrac style="text"><mml:mi>B</mml:mi><mml:mover accent="true"><mml:mi>B</mml:mi><mml:mo mathvariant="normal" stretchy="false">^</mml:mo></mml:mover></mml:mfrac></mml:mstyle><mml:mo>=</mml:mo><mml:mn mathvariant="normal">0.76</mml:mn></mml:mrow></mml:math></inline-formula>.</p><?xmltex \hack{\clearpage}?>
</sec>
</app>
  </app-group><notes notes-type="competinginterests">

      <p id="d1e3574">The authors declare that they have no conflict of interest.</p>
  </notes><ack><title>Acknowledgements</title><p id="d1e3580">This work was supported by the Swiss National Science Foundation under Sinergia grant
CRSII2_154486/1 and by a grant from the Swiss National Supercomputing Centre and PRACE.
We would like to acknowledge the many contributors to the GPU-capable version of COSMO
used in this study, among others Andrea Arteaga, Mauro Bianco, Isabelle Bey,
Christophe Charpilloz, Valentin Clement, Ben Cumming, Tiziano Diamanti, Tobias Gysi,
Peter Messmer, Katherine Osterried, Anne Roches, Stefan Rüdisühli, and Pascal Spörri.
Also, the authors would like to acknowledge the Center for Climate Systems Modeling
(C2SM) and the Federal Office of Meteorology and Climatology MeteoSwiss for their support.
Furthermore, we would like to thank Nils Wedi and Piotr Smolarkiewicz (ECMWF) for useful
comments and discussions. Finally, we would like to thank an anonymous reviewer,
Rupert W. Ford (STFC), and the topical editor of GMD (Sophie Valcke) for their constructive
comments and reviews that significantly improved the clarity and quality of the manuscript.<?xmltex \hack{\newline}?><?xmltex \hack{\newline}?>
Edited by: Sophie Valcke<?xmltex \hack{\newline}?>
Reviewed by: Rupert W. Ford and one anonymous referee</p></ack><ref-list>
    <title>References</title>

      <ref id="bib1.bibx1"><label>Alverson et al.(2012)Alverson, Froese, Kaplan, and
Roweth</label><mixed-citation>Alverson, B., Froese, E., Kaplan, L., and Roweth, D.: Cray XC Series Network,
Tech. Rep., available at: <uri>https://www.cray.com/sites/default/files/resources/CrayXCNetwork.pdf</uri> (last access: 3 April 2017), 2012.</mixed-citation></ref>
      <ref id="bib1.bibx2"><label>Balaji et al.(2017)Balaji, Maisonnave, Zadeh, Lawrence, Biercamp,
Fladrich, Aloisio, Benson, Caubel, Durachta, Foujols, Lister, Mocavero,
Underwood, and Wright</label><mixed-citation>Balaji, V., Maisonnave, E., Zadeh, N., Lawrence, B. N., Biercamp, J.,
Fladrich, U., Aloisio, G., Benson, R., Caubel, A., Durachta, J., Foujols,
M.-A., Lister, G., Mocavero, S., Underwood, S., and Wright, G.: CPMIP:
measurements of real computational performance of Earth system models in
CMIP6, Geosci. Model Dev., 10, 19–34, <ext-link xlink:href="https://doi.org/10.5194/gmd-10-19-2017" ext-link-type="DOI">10.5194/gmd-10-19-2017</ext-link>,
2017.</mixed-citation></ref>
      <ref id="bib1.bibx3"><label>Baldauf et al.(2011)Baldauf, Seifert, Foerstner, Majewski,
Raschendorfer, and Reinhardt</label><mixed-citation>
Baldauf, M., Seifert, A., Foerstner, J., Majewski, D., Raschendorfer, M., and
Reinhardt, T.: Operational Convective-Scale Numerical Weather Prediction with
the COSMO Model: Description and Sensitivities, Mon. Weather Rev., 139,
3887–3905, 2011.</mixed-citation></ref>
      <ref id="bib1.bibx4"><?xmltex \def\ref@label{{Ban et~al.(2015)Ban, Schmidli, and Sch{\"{a}}r}}?><label>Ban et al.(2015)Ban, Schmidli, and Schär</label><mixed-citation>
Ban, N., Schmidli, J., and Schär, C.: Heavy precipitation in a changing
climate: Does short-term summer precipitation increase faster?, Geophys.
Res. Lett., 42, 1165–1172,   2015.</mixed-citation></ref>
      <ref id="bib1.bibx5"><label>Benoit et al.(2002)</label><mixed-citation>
Benoit, R., Schär, C., Binder, P., Chamberland, S., Davies, H. C., Desgagné,
M., Girard, C., Keil, C., Kouwen, N., Lüthi, D., Maric, D., Müller, E.,
Pellerin, P., Schmidli, J., Schubiger, F., Schwierz, C., Sprenger, M.,
Walser, A., Willemse, S., Yu, W., and Zala, E.: The Real-Time Ultrafinescale
Forecast Support during the
Special Observing Period of the MAP, B. Am. Meteorol.
Soc., 83, 85–109, 2002.</mixed-citation></ref>
      <ref id="bib1.bibx6"><label>Bony et al.(2015)</label><mixed-citation>
Bony, S.,  Stevens, B.,   Frierson, D. M. W.,   Jakob, C.,
Kageyama, M.,  Pincus, R.,   Shepherd, T. G.,   Sherwood, S. C.,
Siebesma, A. P.,  Sobel, A. H.,   Watanabe, M.,
and Webb, M. J.: Clouds, circulation and climate sensitivity, Nat. Geosci.,
8, 261–268, 2015.</mixed-citation></ref>
      <ref id="bib1.bibx7"><label>Borkar and Chien(2011)</label><mixed-citation>
Borkar, S. and Chien, A. A.: The Future of Microprocessors, Commun. ACM, 54,
67–77, 2011.</mixed-citation></ref>
      <ref id="bib1.bibx8"><label>Bott(1989)</label><mixed-citation>
Bott, A.: A positive definite advection scheme obtained by nonlinear
renormalization of the advective fluxes, Mon. Weather Rev., 117,
1006–1016, 1989.</mixed-citation></ref>
      <ref id="bib1.bibx9"><label>Boucher et al.(2013)Boucher, Randall et al.</label><mixed-citation>Boucher, O., Randall, D., Artaxo, P., Bretherton, C., Feingold, G., Forster, P.,
Kerminen, V.-M., Kondo, Y., Liao, H., Lohmann, U., Rasch, P., Satheesh, S. K.,
Sherwood, S., Stevens, B., and Zhang, X. Y.: Clouds and Aerosols. In: Climate
Change 2013: The Physical Science Basis. Contribution of Working Group I to
the Fifth Assessment Report of the Intergovernmental Panel on Climate Change,
edited by: Stocker, T. F., Qin, D., Plattner, G.-K., Tignor, M., Allen, S. K.,
Boschung, J., Nauels,
A., Xia, Y., Bex, V., and Midgley, P. M., Cambridge University
Press, Cambridge, United Kingdom and New York, NY, USA, 571–658,
<ext-link xlink:href="https://doi.org/10.1017/CBO9781107415324.016" ext-link-type="DOI">10.1017/CBO9781107415324.016</ext-link>, 2013.</mixed-citation></ref>
      <ref id="bib1.bibx10"><label>Bretherton and Khairoutdinov(2015)</label><mixed-citation>
Bretherton, C. S. and Khairoutdinov, M. F.: Convective self-aggregation
feedbacks in near-global cloud-resolving simulations of an aquaplanet,
J. Adv. Model. Earth Syst., 7, 1765–1787, 2015.</mixed-citation></ref>
      <ref id="bib1.bibx11"><label>CLM-Community(2017)</label><mixed-citation>CLM-Community: Climate Limited-area Modelling Community,
available at: <uri>http://www.clm-community.eu/</uri>, last access: 3 April, 2017.</mixed-citation></ref>
      <ref id="bib1.bibx12"><label>COSMO(2017)</label><mixed-citation>COSMO: Consortium for Small-Scale Modeling,
available at: <uri>http://www.cosmo-model.org/</uri>, last access: 4 April, 2017.</mixed-citation></ref>
      <ref id="bib1.bibx13"><label>Davies et al.(2003)Davies, Staniforth, Wood, and
Thuburn</label><mixed-citation>
Davies, T., Staniforth, A., Wood, N., and Thuburn, J.: Validity of anelastic
and other equation sets as inferred from normal-mode analysis, Q.
J. Roy. Meteorol. Soc., 129, 2761–2775, 2003.</mixed-citation></ref>
      <ref id="bib1.bibx14"><label>Davies et al.(2005)</label><mixed-citation>Davies, T., Cullen, M. J., Malcolm, A. J., Mawson, M. H., Staniforth, A.,
White, A. A., and Wood, N.: A new dynamical core for the Met Office's
global and regional modelling of the atmosphere,  Q. J. Roy. Meteorol. Soc.,
131, 1759–1782, <ext-link xlink:href="https://doi.org/10.1256/qj.04.101" ext-link-type="DOI">10.1256/qj.04.101</ext-link>, 2005.</mixed-citation></ref>
      <ref id="bib1.bibx15"><label>Deakin et al.(2016)Deakin, Price, Martineau, and
McIntosh-Smith</label><mixed-citation>
Deakin, T., Price, J., Martineau, M., and McIntosh-Smith, S.: GPU-STREAM v2.0:
Benchmarking the Achievable Memory Bandwidth of Many-Core Processors Across
Diverse Parallel Programming Models, 489–507, Springer, Cham, 2016.</mixed-citation></ref>
      <ref id="bib1.bibx16"><?xmltex \def\ref@label{{Doms and Sch{\"{a}}ttler(1999)}}?><label>Doms and Schättler(1999)</label><mixed-citation>Doms, G. and Schättler, U.: The nonhydrostatic limited-area model LM
(Lokal-Modell) of the DWD. Part I: Scientific documentation, Tech.
rep., German Weather Service (DWD), Offenbach, Germany,
available at: <uri>http://www.cosmo-model.org/</uri> (last access: 19 March 2018), 1999.</mixed-citation></ref>
      <ref id="bib1.bibx17"><label>Durran(2010)</label><mixed-citation>
Durran, D. R.: Numerical Methods for Fluid Dynamics with Applications to
Geophysics, Vol. 32 of Texts in Applied Mathematics, Springer, New
York, 2010.</mixed-citation></ref>
      <ref id="bib1.bibx18"><label>ECMWF(2016)</label><mixed-citation>
ECMWF: IFS Documentation Part III: Dynamics and numerical procedures, European
Centre for Medium-Range Weather Forecasts, Shinfield Park, Reading, RG2 9AX,
England, 2016.</mixed-citation></ref>
      <ref id="bib1.bibx19"><label>Ewing and Wang(2001)</label><mixed-citation>
Ewing, R. E. and Wang, H.: A summary of numerical methods for time-dependent
advection-dominated partial differential equations, J. Comput.
Appl. Math., 128, 423–445, Numerical Analysis 2000, Vol. VII:
Partial Differential Equations, 2001.</mixed-citation></ref>
      <ref id="bib1.bibx20"><label>Eyring et al.(2016)</label><mixed-citation>Eyring, V., Bony, S., Meehl, G. A., Senior, C. A., Stevens, B., Stouffer, R.
J., and Taylor, K. E.: Overview of the Coupled Model Intercomparison Project
Phase 6 (CMIP6) experimental design and organization, Geosci. Model Dev., 9,
1937–1958, <ext-link xlink:href="https://doi.org/10.5194/gmd-9-1937-2016" ext-link-type="DOI">10.5194/gmd-9-1937-2016</ext-link>, 2016.</mixed-citation></ref>
      <ref id="bib1.bibx21"><label>Fourestey et al.(2014)</label><mixed-citation>Fourestey, G., Cumming, B., Gilly, L., and Schulthess, T. C.:
First Experiences With Validating and Using the Cray
Power Management Database Tool, CoRR, abs/1408.2657, available at: <uri>http://arxiv.org/abs/1408.2657</uri>
(last access: 20 April 2018), 2014.</mixed-citation></ref>
      <?pagebreak page1680?><ref id="bib1.bibx22"><label>Fu et al.(2004)Fu, Niino, Kimura, and Kato</label><mixed-citation>
Fu, G., Niino, H., Kimura, R., and Kato, T.: Multiple Polar Mesocyclones over
the Japan Sea on 11 February 1997, Mon. Weather Rev., 132, 793–814,
2004.</mixed-citation></ref>
      <ref id="bib1.bibx23"><label>Fuhrer et al.(2014)</label><mixed-citation>Fuhrer, O., Osuna, C., Lapillonne, X., Gysi, T., Cumming, B., Bianco, M.,
Arteaga, A., and Schulthess, T. C.: Towards a performance portable, architecture agnostic
implementation strategy for weather and climate models, Supercomputing
frontiers and innovations, 1,
available at: <uri>http://superfri.org/superfri/article/view/17</uri> (last access: 17 March 2018), 2014.</mixed-citation></ref>
      <ref id="bib1.bibx24"><label>Gadde(2013)</label><mixed-citation>
Gadde, S.: Graph partitioning algorithms for minimizing inter-node
communication on a distributed system, Ph.D. thesis, The University of
Toledo, 2013.</mixed-citation></ref>
      <ref id="bib1.bibx25"><label>Giraldo et al.(2013)Giraldo, Kelly, and Constantinescu</label><mixed-citation>Giraldo, F. X., Kelly, J. F., and Constantinescu, E. M.: Implicit-Explicit
Formulations of a Three-Dimensional Nonhydrostatic Unified Model of the
Atmosphere (NUMA), SIAM J. Sci. Comp., 35, B1162–B1194,
<ext-link xlink:href="https://doi.org/10.1137/120876034" ext-link-type="DOI">10.1137/120876034</ext-link>,   2013.</mixed-citation></ref>
      <ref id="bib1.bibx26"><label>Goodrich et al.(2010)Goodrich, Sitchinava, and Arge</label><mixed-citation>
Goodrich, M. T., Sitchinava, N., and Arge, L.: Parallel external memory graph
algorithms, 2010 IEEE International Symposium on Parallel and Distributed
Processing (IPDPS), 00, 1–11, 2010.</mixed-citation></ref>
      <ref id="bib1.bibx27"><?xmltex \def\ref@label{{Gysi et~al.(2015{\natexlab{a}})Gysi, Grosser, and Hoefler}}?><label>Gysi et al.(2015a)Gysi, Grosser, and Hoefler</label><mixed-citation>
Gysi, T., Grosser, T., and Hoefler, T.: MODESTO: Data-centric Analytic
Optimization of Complex Stencil Programs on Heterogeneous Architectures, in:
Proceedings of the 29th ACM on International Conference on Supercomputing,
ICS '15,  177–186, ACM, New York, NY, USA, 2015a.</mixed-citation></ref>
      <ref id="bib1.bibx28"><?xmltex \def\ref@label{{Gysi et~al.(2015{\natexlab{b}})Gysi, Osuna, Fuhrer, Bianco, and
Schulthess}}?><label>Gysi et al.(2015b)Gysi, Osuna, Fuhrer, Bianco, and
Schulthess</label><mixed-citation>
Gysi, T., Osuna, C., Fuhrer, O., Bianco, M., and Schulthess, T. C.: STELLA: A
Domain-specific Tool for Structured Grid Methods in Weather and Climate
Models, in: Proc. of the Intl. Conf. for High Performance Computing,
Networking, Storage and Analysis, SC '15,  41:1–41:12, ACM, New York, NY,
USA, 2015b.</mixed-citation></ref>
      <ref id="bib1.bibx29"><label>Heise et al.(2006)Heise, Ritter, and Schrodin</label><mixed-citation>
Heise, E., Ritter, B., and Schrodin, R.: Operational implementation of the
multilayer soil model, COSMO Tech. Rep., No. 9, Tech. rep., COSMO, 2006.</mixed-citation></ref>
      <ref id="bib1.bibx30"><label>Hong and Kung(1981)</label><mixed-citation>
Hong, J.-W. and Kung, H. T.: I/O Complexity: The Red-blue Pebble Game, in:
Proceedings of the Thirteenth Annual ACM Symposium on Theory of Computing,
STOC '81, 326–333, ACM, New York, NY, USA, 1981.</mixed-citation></ref>
      <ref id="bib1.bibx31"><label>Hope(2015)</label><mixed-citation>Hope, C.: The $10 trillion value of better information about the transient
climate response, Philos. T. Roy. Soc. A, 373, 2054, <ext-link xlink:href="https://doi.org/10.1098/rsta.2014.0429" ext-link-type="DOI">10.1098/rsta.2014.0429</ext-link>,
2015.</mixed-citation></ref>
      <ref id="bib1.bibx32"><label>Hutter et al.(2014)Hutter, Iannuzzi, Schiffmann, and
VandeVondele</label><mixed-citation>
Hutter, J., Iannuzzi, M., Schiffmann, F., and VandeVondele, J.: CP2K: atomistic
simulations of condensed matter systems, Wiley Interdisciplinary Reviews:
Computational Molecular Science, 4, 15–25, 2014.</mixed-citation></ref>
      <ref id="bib1.bibx33"><label>Jablonowski and Williamson(2006)</label><mixed-citation>
Jablonowski, C. and Williamson, D. L.: A baroclinic instability test case for
atmospheric model dynamical cores, Q. J. Roy.
Meteorol. Soc., 132, 2943–2975, 2006.</mixed-citation></ref>
      <ref id="bib1.bibx34"><label>Karypis and Kumar(2009)</label><mixed-citation>Karypis, G. and Kumar, V.: MeTis: Unstructured Graph Partitioning and Sparse
Matrix Ordering System, available at: <uri>http://www.cs.umn.edu/~metis</uri> (last access: 17 March 2018), 2009.</mixed-citation></ref>
      <ref id="bib1.bibx35"><label>Kendon et al.(2014)</label><mixed-citation>
Kendon, E. J., Roberts, N. M., Fowler, H. J., Roberts,
M. J., Chan, S. C., and Senior, C. A.: Heavier summer downpours with climate change revealed by
weather forecast resolution model, Nat. Clim. Change, 4, 570–576, 2014.</mixed-citation></ref>
      <ref id="bib1.bibx36"><label>Kwok and Ahmad(1999)</label><mixed-citation>
Kwok, Y.-K. and Ahmad, I.: Static Scheduling Algorithms for Allocating Directed
Task Graphs to Multiprocessors, ACM Comput. Surv., 31, 406–471, 1999.</mixed-citation></ref>
      <ref id="bib1.bibx37"><label>Lapillonne and Fuhrer(2014)</label><mixed-citation>Lapillonne, X. and Fuhrer, O.: Using Compiler Directives to Port Large
Scientific Applications to GPUs: An Example from Atmospheric Science,
Parallel Processing Letters, 24, 1450003, <ext-link xlink:href="https://doi.org/10.1142/S0129626414500030" ext-link-type="DOI">10.1142/S0129626414500030</ext-link>, 2014.</mixed-citation></ref>
      <ref id="bib1.bibx38"><label>Lapillonne et al.(2016)</label><mixed-citation>
Lapillonne, X., Fuhrer, O., Spörri, P., Osuna, C.,
Walser, A., Arteaga, A., Gysi, T., Rüdisühli, S., Osterried, K., and
Schulthess, T.: Operational numerical weather prediction on a
GPU-accelerated cluster supercomputer, in: EGU General Assembly Conference
Abstracts, Vol. 18 of  EGU General Assembly Conference Abstracts, p. 13554, 2016.</mixed-citation></ref>
      <ref id="bib1.bibx39"><label>Lee et al.(2010)</label><mixed-citation>
Lee, V. W., Kim, C., Chhugani, J., Deisher, M.,
Kim, D., Nguyen, A. D., Satish, N., Smelyanskiy,
M., Chennupaty, S., Hammarlund, P., Singhal, R., and
Dubey, P.: Debunking the 100X GPU vs. CPU Myth: An Evaluation of
Throughput Computing on CPU and GPU, SIGARCH Comput. Archit. News, 38,
451–460, 2010.</mixed-citation></ref>
      <ref id="bib1.bibx40"><?xmltex \def\ref@label{{Leutwyler et~al.(2016)Leutwyler, Fuhrer, Lapillonne, L{\"{u}}thi, and
Sch{\"{a}}r}}?><label>Leutwyler et al.(2016)Leutwyler, Fuhrer, Lapillonne, Lüthi, and
Schär</label><mixed-citation>Leutwyler, D., Fuhrer, O., Lapillonne, X., Lüthi, D., and Schär, C.: Towards
European-scale convection-resolving climate simulations with GPUs: a study
with COSMO 4.19, Geosci. Model Dev., 9, 3393–3412,
<ext-link xlink:href="https://doi.org/10.5194/gmd-9-3393-2016" ext-link-type="DOI">10.5194/gmd-9-3393-2016</ext-link>, 2016.</mixed-citation></ref>
      <ref id="bib1.bibx41"><?xmltex \def\ref@label{{Leutwyler et~al.(2017)Leutwyler, L\"{u}thi, Ban, Fuhrer, and
Sch\"{a}r}}?><label>Leutwyler et al.(2017)Leutwyler, Lüthi, Ban, Fuhrer, and
Schär</label><mixed-citation>Leutwyler, D., Lüthi, D., Ban, N., Fuhrer, O., and Schär, C.: Evaluation of
the convection-resolving climate modeling approach on continental scales,
J. Geophys. Res.-Atmos., 122, 5237–5258,
<ext-link xlink:href="https://doi.org/10.1002/2016JD026013" ext-link-type="DOI">10.1002/2016JD026013</ext-link>,  2017.</mixed-citation></ref>
      <ref id="bib1.bibx42"><label>Liu et al.(2014)Liu, Zhang, and Zhang</label><mixed-citation>Liu, L., Zhang, T., and Zhang, J.: DAG Based Multipath Routing Algorithm for
Load Balancing in Machine-to-Machine Networks, Int. J. Distrib. Sens. N.,
10, 457962, <ext-link xlink:href="https://doi.org/10.1155/2014/457962" ext-link-type="DOI">10.1155/2014/457962</ext-link>, 2014.</mixed-citation></ref>
      <ref id="bib1.bibx43"><label>Ludwig et al.(2015)Ludwig, Pinto, Hoepp, Fink, and
Gray</label><mixed-citation>
Ludwig, P., Pinto, J. G., Hoepp, S. A., Fink, A. H., and Gray, S. L.: Secondary
Cyclogenesis along an Occluded Front Leading to Damaging Wind Gusts:
Windstorm Kyrill, January 2007, Mon. Weather Rev., 143, 1417–1437,
2015.</mixed-citation></ref>
      <ref id="bib1.bibx44"><label>Michalakes et al.(2015)</label><mixed-citation>
Michalakes, J., Govett, M., Benson, R., Black, T., Juang, H., Reinecke, A.,
Skamarock, B., Duda, M., Henderson, T., Madden, P., Mozdzynski, G., and
Vasic, R.: AVEC Report: NGGPS Level-1 Benchmarks and Software
Evaluation, Tech. Rep., NGGPS Dynamical Core Test Group, 2015.</mixed-citation></ref>
      <ref id="bib1.bibx45"><label>Miura et al.(2007)Miura, Satoh, Nasuno, Noda, and
Oouchi</label><mixed-citation>
Miura, H., Satoh, M., Nasuno, T., Noda, A. T., and Oouchi, K.: A
Madden-Julian Oscillation Event Realistically Simulated by a Global
Cloud-Resolving Model, Science, 318, 1763–1765, 2007.</mixed-citation></ref>
      <ref id="bib1.bibx46"><label>Miyamoto et al.(2013)</label><mixed-citation>Miyamoto, Y., Kajikawa, Y.,
Yoshida, R., Yamaura, T.,
Yashiro, H., and Tomita, H.: Deep moist atmospheric convection in a subkilometer global
simulation, Geophys. Res. Lett., 40, 4922–4926, <ext-link xlink:href="https://doi.org/10.1002/grl.50944" ext-link-type="DOI">10.1002/grl.50944</ext-link>, 2013.</mixed-citation></ref>
      <ref id="bib1.bibx47"><label>National Research Council(2012)</label><mixed-citation>National Research Council:  A National Strategy for Advancing Climate
Modeling. Washington, DC: The National Academies Press,
<ext-link xlink:href="https://doi.org/10.17226/13430" ext-link-type="DOI">10.17226/13430</ext-link>, 2012.</mixed-citation></ref>
      <ref id="bib1.bibx48"><label>NVIDIA(2016)</label><mixed-citation>NVIDIA: NVIDIA TESLA P100 Technical Overview, Tech. Rep., available at: <uri>http://images.nvidia.com/content/tesla/pdf/nvidia-teslap100-techoverview.pdf</uri>
(last access: 3 April 2017), 2016.</mixed-citation></ref>
      <ref id="bib1.bibx49"><label>Pachauri and Meyer(2014)</label><mixed-citation>
Pachauri, R. K. and Meyer, L. A. (Eds.): Climate Change 2014: Synthesis Report.
Contribution of Working Groups I, II and III to the Fifth Assessment Report
of the Intergovernmental Panel on Climate Change, p. 151, IPCC, Geneva, Switzerland,
2014.</mixed-citation></ref>
      <ref id="bib1.bibx50"><label>Palmer(2014)</label><mixed-citation>Palmer, T.: Climate forecasting: Build high-resolution global climate models,
Nature, 515, 338–339, <ext-link xlink:href="https://doi.org/10.1038/515338a" ext-link-type="DOI">10.1038/515338a</ext-link>, 2014.</mixed-citation></ref>
      <ref id="bib1.bibx51"><label>Park et al.(2013)Park, Skamarock, Klemp, Fowler, and
Duda</label><mixed-citation>
Park, S.-H., Skamarock, W. C., Klemp, J. B., Fowler, L. D., and Duda, M. G.:
Evaluation of Global Atmospheric Solvers Using<?pagebreak page1681?> Extensions of the Jablonowski
and Williamson Baroclinic Wave Test Case, Mon. Weather Rev., 141,
3116–3129, 2013.</mixed-citation></ref>
      <ref id="bib1.bibx52"><label>Ralph(1996)</label><mixed-citation>
Ralph, F. M.: Observations of 250-km-Wavelength Clear-Air Eddies and
750-km-Wavelength Mesocyclones Associated with a Synoptic-Scale Midlatitude
Cyclone, Mon. Weather Rev., 124, 1199–1210, 1996.</mixed-citation></ref>
      <ref id="bib1.bibx53"><label>Raschendorfer(2001)</label><mixed-citation>
Raschendorfer, M.: The new turbulence parameterization of
LM, Quarterly Report of the Operational
NWP-Models of the DWD, No. 19, 3–12 May, 1999.</mixed-citation></ref>
      <ref id="bib1.bibx54"><label>Reinhardt and Seifert(2006)</label><mixed-citation>Reinhardt, T. and Seifert, A.: A three-category ice scheme for LMK, 6 pp.,
available at: <uri>http://www.cosmo-model.org/content/model/documentation/newsLetters/newsLetter06/cnl6_reinhardt.pdf</uri>
(last access: 20 April 2018), 2006.</mixed-citation></ref>
      <ref id="bib1.bibx55"><label>Ricard et al.(2013)Ricard, Lac, Riette, Legrand, and
Mary</label><mixed-citation>Ricard, D., Lac, C., Riette, S., Legrand, R., and Mary, A.: Kinetic energy
spectra characteristics of two convection-permitting limited-area models
AROME and Meso-NH, Q. J. Roy. Meteorol. Soc.,
139, 1327–1341, <ext-link xlink:href="https://doi.org/10.1002/qj.2025" ext-link-type="DOI">10.1002/qj.2025</ext-link>, 2013.</mixed-citation></ref>
      <ref id="bib1.bibx56"><?xmltex \def\ref@label{{Richard et~al.(2007)Richard, Buzzi, and Z{\"{a}}ngl}}?><label>Richard et al.(2007)Richard, Buzzi, and Zängl</label><mixed-citation>Richard, E., Buzzi, A., and Zängl, G.: Quantitative precipitation
forecasting in the Alps: The advances achieved by the Mesoscale Alpine
Programme, Q. J. Roy. Meteorol. Soc., 133, 831–846, <ext-link xlink:href="https://doi.org/10.1002/qj.65" ext-link-type="DOI">10.1002/qj.65</ext-link>,
2007.</mixed-citation></ref>
      <ref id="bib1.bibx57"><label>Ritter and Geleyn(1992)</label><mixed-citation>
Ritter, B. and Geleyn, J.-F.: A comprehensive radiation scheme for numerical
weather prediction models with potential applications in climate simulations,
Mon. Weather Rev., 120, 303–325, 1992.</mixed-citation></ref>
      <ref id="bib1.bibx58"><label>Schneider et al.(2017)</label><mixed-citation>Schneider, T., Teixeira, J., Bretherton, C. S., Brient, F.,
Pressel, K. G., Schär, C., and Siebesma, A. P.:
Climate goals and computing the future of clouds, Nat.
Clim. Change, 7, 3–5, <ext-link xlink:href="https://doi.org/10.1038/nclimate3190" ext-link-type="DOI">10.1038/nclimate3190</ext-link>, 2017.</mixed-citation></ref>
      <ref id="bib1.bibx59"><label>Shalf et al.(2011)Shalf, Dosanjh, and Morrison</label><mixed-citation>Shalf, J., Dosanjh, S., and Morrison, J.: Exascale Computing Technology
Challenges,  1–25, Springer Berlin Heidelberg, Berlin, Heidelberg,
<ext-link xlink:href="https://doi.org/10.1007/978-3-642-19328-6_1" ext-link-type="DOI">10.1007/978-3-642-19328-6_1</ext-link>,  2011.</mixed-citation></ref>
      <ref id="bib1.bibx60"><label>Skamarock et al.(2014)Skamarock, Park, Klemp, and
Snyder</label><mixed-citation>Skamarock, W. C., Park, S.-H., Klemp, J. B., and Snyder, C.: Atmospheric
Kinetic Energy Spectra from Global High-Resolution Nonhydrostatic
Simulations, J. Atmos. Sci., 71, 4369–4381, 2014.
 </mixed-citation></ref><?xmltex \hack{\newpage}?>
      <ref id="bib1.bibx61"><?xmltex \def\ref@label{{Steppeler et~al.(2002)Steppeler, Doms, Sch{\"{a}}ttler, Bitzer,
Gassmann, Damrath, and Gregoric}}?><label>Steppeler et al.(2002)Steppeler, Doms, Schättler, Bitzer,
Gassmann, Damrath, and Gregoric</label><mixed-citation>Steppeler, J., Doms, G., Schättler, U., Bitzer, H., Gassmann, A., Damrath,
U., and Gregoric, G.: Meso gamma scale forecasts using the nonhydrostatic
model LM, Meteorol. Atmos. Phys., 82, 75–96, <ext-link xlink:href="https://doi.org/10.1007/s00703-001-0592-9" ext-link-type="DOI">10.1007/s00703-001-0592-9</ext-link>, 2002.</mixed-citation></ref>
      <ref id="bib1.bibx62"><label>TOP500(2017)</label><mixed-citation>TOP500: Supercomputer Site, available at: <uri>http://www.top500.org</uri>
(last access: 20 April 2018), 2017.</mixed-citation></ref>
      <ref id="bib1.bibx63"><label>Ullrich et al.(2015)Ullrich, Reed, and Jablonowski</label><mixed-citation>Ullrich, P. A., Reed, K. A., and Jablonowski, C.: Analytical initial conditions
and an analysis of baroclinic instability waves in f- and <inline-formula><mml:math id="M162" display="inline"><mml:mi mathvariant="italic">β</mml:mi></mml:math></inline-formula>-plane 3D
channel models, Q. J. Roy. Meteorol. Soc., 141,
2972–2988, 2015.</mixed-citation></ref>
      <ref id="bib1.bibx64"><label>Vetter(2001)</label><mixed-citation>
Vetter, J. S.: External Memory Algorithms and Data Structures: Dealing with
Massive Data, ACM Comput. Surv., 33, 209–271, 2001.</mixed-citation></ref>
      <ref id="bib1.bibx65"><label>Wicker and Skamarock(2002)</label><mixed-citation>
Wicker, L. J. and Skamarock, W. C.: Time-Splitting Methods for Elastic Models
Using Forward Time Schemes, Mon. Weather Rev., 130, 2088–2097, 2002.</mixed-citation></ref>
      <ref id="bib1.bibx66"><label>Williams et al.(2009)Williams, Waterman, and Patterson</label><mixed-citation>Williams, S., Waterman, A., and Patterson, D.: Roofline: An Insightful Visual
Performance Model for Multicore Architectures, Commun. ACM, 52, 65–76,
<ext-link xlink:href="https://doi.org/10.1145/1498765.1498785" ext-link-type="DOI">10.1145/1498765.1498785</ext-link>,  2009.</mixed-citation></ref>
      <ref id="bib1.bibx67"><label>Yang et al.(2016a)</label><mixed-citation>
Yang, C., Xue, W., Fu, H., You, H., Wang, X.,
Ao, Y., Liu, F., Gan, L., Xu, P., Wang, L.,
Yang, G., and Zheng, W.: 10M-core Scalable Fully-implicit Solver
for Nonhydrostatic
Atmospheric Dynamics, in: Proceedings of the International Conference for
High Performance Computing, Networking, Storage and Analysis, SC16,
6:1–6:12, IEEE Press, Piscataway, NJ, USA, 2016a.</mixed-citation></ref>
      <ref id="bib1.bibx68"><label>Yashiro et al.(2016)</label><mixed-citation>Yashiro, H., Terai, M., Yoshida, R., Iga, S.-I.,
Minami, K., and Tomita, H.: Performance Analysis and Optimization of
Nonhydrostatic
ICosahedral Atmospheric Model (NICAM) on the K Computer and TSUBAME2.5,
in: Proceedings of the Platform for Advanced Scientific Computing Conference
– PASC'16, ACM Press, <ext-link xlink:href="https://doi.org/10.1145/2929908.2929911" ext-link-type="DOI">10.1145/2929908.2929911</ext-link>, 2016.</mixed-citation></ref>

  </ref-list></back>
    <!--<article-title-html>Near-global climate simulation at 1&thinsp;km resolution: establishing a performance baseline on 4888&thinsp;GPUs with COSMO 5.0</article-title-html>
<abstract-html><p>The best hope for reducing long-standing global climate model biases is by
increasing resolution to the kilometer scale. Here we present results from an
ultrahigh-resolution non-hydrostatic climate model for a near-global setup
running on the full Piz Daint supercomputer on 4888&thinsp;GPUs (graphics
processing units). The dynamical core of the model has been completely
rewritten using a domain-specific language (DSL) for performance portability
across different hardware architectures. Physical parameterizations and
diagnostics have been ported using compiler directives. To our knowledge this
represents the first complete atmospheric model being run entirely on
accelerators on this scale. At a grid spacing of 930&thinsp;m (1.9&thinsp;km), we achieve
a simulation throughput of 0.043 (0.23) simulated years per day and an energy
consumption of 596&thinsp;MWh per simulated year. Furthermore, we propose a new
memory usage efficiency (MUE) metric that considers how efficiently the
memory bandwidth – the dominant bottleneck of climate codes – is being
used.</p></abstract-html>
<ref-html id="bib1.bib1"><label>Alverson et al.(2012)Alverson, Froese, Kaplan, and
Roweth</label><mixed-citation>
Alverson, B., Froese, E., Kaplan, L., and Roweth, D.: Cray XC Series Network,
Tech. Rep., available at: <a href="https://www.cray.com/sites/default/files/resources/CrayXCNetwork.pdf" target="_blank">https://www.cray.com/sites/default/files/resources/CrayXCNetwork.pdf</a> (last access: 3 April 2017), 2012.
</mixed-citation></ref-html>
<ref-html id="bib1.bib2"><label>Balaji et al.(2017)Balaji, Maisonnave, Zadeh, Lawrence, Biercamp,
Fladrich, Aloisio, Benson, Caubel, Durachta, Foujols, Lister, Mocavero,
Underwood, and Wright</label><mixed-citation>
Balaji, V., Maisonnave, E., Zadeh, N., Lawrence, B. N., Biercamp, J.,
Fladrich, U., Aloisio, G., Benson, R., Caubel, A., Durachta, J., Foujols,
M.-A., Lister, G., Mocavero, S., Underwood, S., and Wright, G.: CPMIP:
measurements of real computational performance of Earth system models in
CMIP6, Geosci. Model Dev., 10, 19–34, <a href="https://doi.org/10.5194/gmd-10-19-2017" target="_blank">https://doi.org/10.5194/gmd-10-19-2017</a>,
2017.
</mixed-citation></ref-html>
<ref-html id="bib1.bib3"><label>Baldauf et al.(2011)Baldauf, Seifert, Foerstner, Majewski,
Raschendorfer, and Reinhardt</label><mixed-citation>
Baldauf, M., Seifert, A., Foerstner, J., Majewski, D., Raschendorfer, M., and
Reinhardt, T.: Operational Convective-Scale Numerical Weather Prediction with
the COSMO Model: Description and Sensitivities, Mon. Weather Rev., 139,
3887–3905, 2011.
</mixed-citation></ref-html>
<ref-html id="bib1.bib4"><label>Ban et al.(2015)Ban, Schmidli, and Schär</label><mixed-citation>
Ban, N., Schmidli, J., and Schär, C.: Heavy precipitation in a changing
climate: Does short-term summer precipitation increase faster?, Geophys.
Res. Lett., 42, 1165–1172,   2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib5"><label>Benoit et al.(2002)</label><mixed-citation>
Benoit, R., Schär, C., Binder, P., Chamberland, S., Davies, H. C., Desgagné,
M., Girard, C., Keil, C., Kouwen, N., Lüthi, D., Maric, D., Müller, E.,
Pellerin, P., Schmidli, J., Schubiger, F., Schwierz, C., Sprenger, M.,
Walser, A., Willemse, S., Yu, W., and Zala, E.: The Real-Time Ultrafinescale
Forecast Support during the
Special Observing Period of the MAP, B. Am. Meteorol.
Soc., 83, 85–109, 2002.
</mixed-citation></ref-html>
<ref-html id="bib1.bib6"><label>Bony et al.(2015)</label><mixed-citation>
Bony, S.,  Stevens, B.,   Frierson, D. M. W.,   Jakob, C.,
Kageyama, M.,  Pincus, R.,   Shepherd, T. G.,   Sherwood, S. C.,
Siebesma, A. P.,  Sobel, A. H.,   Watanabe, M.,
and Webb, M. J.: Clouds, circulation and climate sensitivity, Nat. Geosci.,
8, 261–268, 2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib7"><label>Borkar and Chien(2011)</label><mixed-citation>
Borkar, S. and Chien, A. A.: The Future of Microprocessors, Commun. ACM, 54,
67–77, 2011.
</mixed-citation></ref-html>
<ref-html id="bib1.bib8"><label>Bott(1989)</label><mixed-citation>
Bott, A.: A positive definite advection scheme obtained by nonlinear
renormalization of the advective fluxes, Mon. Weather Rev., 117,
1006–1016, 1989.
</mixed-citation></ref-html>
<ref-html id="bib1.bib9"><label>Boucher et al.(2013)Boucher, Randall et al.</label><mixed-citation>
Boucher, O., Randall, D., Artaxo, P., Bretherton, C., Feingold, G., Forster, P.,
Kerminen, V.-M., Kondo, Y., Liao, H., Lohmann, U., Rasch, P., Satheesh, S. K.,
Sherwood, S., Stevens, B., and Zhang, X. Y.: Clouds and Aerosols. In: Climate
Change 2013: The Physical Science Basis. Contribution of Working Group I to
the Fifth Assessment Report of the Intergovernmental Panel on Climate Change,
edited by: Stocker, T. F., Qin, D., Plattner, G.-K., Tignor, M., Allen, S. K.,
Boschung, J., Nauels,
A., Xia, Y., Bex, V., and Midgley, P. M., Cambridge University
Press, Cambridge, United Kingdom and New York, NY, USA, 571–658,
<a href="https://doi.org/10.1017/CBO9781107415324.016" target="_blank">https://doi.org/10.1017/CBO9781107415324.016</a>, 2013.
</mixed-citation></ref-html>
<ref-html id="bib1.bib10"><label>Bretherton and Khairoutdinov(2015)</label><mixed-citation>
Bretherton, C. S. and Khairoutdinov, M. F.: Convective self-aggregation
feedbacks in near-global cloud-resolving simulations of an aquaplanet,
J. Adv. Model. Earth Syst., 7, 1765–1787, 2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib11"><label>CLM-Community(2017)</label><mixed-citation>
CLM-Community: Climate Limited-area Modelling Community,
available at: <a href="http://www.clm-community.eu/" target="_blank">http://www.clm-community.eu/</a>, last access: 3 April, 2017.
</mixed-citation></ref-html>
<ref-html id="bib1.bib12"><label>COSMO(2017)</label><mixed-citation>
COSMO: Consortium for Small-Scale Modeling,
available at: <a href="http://www.cosmo-model.org/" target="_blank">http://www.cosmo-model.org/</a>, last access: 4 April, 2017.
</mixed-citation></ref-html>
<ref-html id="bib1.bib13"><label>Davies et al.(2003)Davies, Staniforth, Wood, and
Thuburn</label><mixed-citation>
Davies, T., Staniforth, A., Wood, N., and Thuburn, J.: Validity of anelastic
and other equation sets as inferred from normal-mode analysis, Q.
J. Roy. Meteorol. Soc., 129, 2761–2775, 2003.
</mixed-citation></ref-html>
<ref-html id="bib1.bib14"><label>Davies et al.(2005)</label><mixed-citation>
Davies, T., Cullen, M. J., Malcolm, A. J., Mawson, M. H., Staniforth, A.,
White, A. A., and Wood, N.: A new dynamical core for the Met Office's
global and regional modelling of the atmosphere,  Q. J. Roy. Meteorol. Soc.,
131, 1759–1782, <a href="https://doi.org/10.1256/qj.04.101" target="_blank">https://doi.org/10.1256/qj.04.101</a>, 2005.
</mixed-citation></ref-html>
<ref-html id="bib1.bib15"><label>Deakin et al.(2016)Deakin, Price, Martineau, and
McIntosh-Smith</label><mixed-citation>
Deakin, T., Price, J., Martineau, M., and McIntosh-Smith, S.: GPU-STREAM v2.0:
Benchmarking the Achievable Memory Bandwidth of Many-Core Processors Across
Diverse Parallel Programming Models, 489–507, Springer, Cham, 2016.
</mixed-citation></ref-html>
<ref-html id="bib1.bib16"><label>Doms and Schättler(1999)</label><mixed-citation>
Doms, G. and Schättler, U.: The nonhydrostatic limited-area model LM
(Lokal-Modell) of the DWD. Part I: Scientific documentation, Tech.
rep., German Weather Service (DWD), Offenbach, Germany,
available at: <a href="http://www.cosmo-model.org/" target="_blank">http://www.cosmo-model.org/</a> (last access: 19 March 2018), 1999.
</mixed-citation></ref-html>
<ref-html id="bib1.bib17"><label>Durran(2010)</label><mixed-citation>
Durran, D. R.: Numerical Methods for Fluid Dynamics with Applications to
Geophysics, Vol. 32 of Texts in Applied Mathematics, Springer, New
York, 2010.
</mixed-citation></ref-html>
<ref-html id="bib1.bib18"><label>ECMWF(2016)</label><mixed-citation>
ECMWF: IFS Documentation Part III: Dynamics and numerical procedures, European
Centre for Medium-Range Weather Forecasts, Shinfield Park, Reading, RG2 9AX,
England, 2016.
</mixed-citation></ref-html>
<ref-html id="bib1.bib19"><label>Ewing and Wang(2001)</label><mixed-citation>
Ewing, R. E. and Wang, H.: A summary of numerical methods for time-dependent
advection-dominated partial differential equations, J. Comput.
Appl. Math., 128, 423–445, Numerical Analysis 2000, Vol. VII:
Partial Differential Equations, 2001.
</mixed-citation></ref-html>
<ref-html id="bib1.bib20"><label>Eyring et al.(2016)</label><mixed-citation>
Eyring, V., Bony, S., Meehl, G. A., Senior, C. A., Stevens, B., Stouffer, R.
J., and Taylor, K. E.: Overview of the Coupled Model Intercomparison Project
Phase 6 (CMIP6) experimental design and organization, Geosci. Model Dev., 9,
1937–1958, <a href="https://doi.org/10.5194/gmd-9-1937-2016" target="_blank">https://doi.org/10.5194/gmd-9-1937-2016</a>, 2016.
</mixed-citation></ref-html>
<ref-html id="bib1.bib21"><label>Fourestey et al.(2014)</label><mixed-citation>
Fourestey, G., Cumming, B., Gilly, L., and Schulthess, T. C.:
First Experiences With Validating and Using the Cray
Power Management Database Tool, CoRR, abs/1408.2657, available at: <a href="http://arxiv.org/abs/1408.2657" target="_blank">http://arxiv.org/abs/1408.2657</a>
(last access: 20 April 2018), 2014.
</mixed-citation></ref-html>
<ref-html id="bib1.bib22"><label>Fu et al.(2004)Fu, Niino, Kimura, and Kato</label><mixed-citation>
Fu, G., Niino, H., Kimura, R., and Kato, T.: Multiple Polar Mesocyclones over
the Japan Sea on 11 February 1997, Mon. Weather Rev., 132, 793–814,
2004.
</mixed-citation></ref-html>
<ref-html id="bib1.bib23"><label>Fuhrer et al.(2014)</label><mixed-citation>
Fuhrer, O., Osuna, C., Lapillonne, X., Gysi, T., Cumming, B., Bianco, M.,
Arteaga, A., and Schulthess, T. C.: Towards a performance portable, architecture agnostic
implementation strategy for weather and climate models, Supercomputing
frontiers and innovations, 1,
available at: <a href="http://superfri.org/superfri/article/view/17" target="_blank">http://superfri.org/superfri/article/view/17</a> (last access: 17 March 2018), 2014.
</mixed-citation></ref-html>
<ref-html id="bib1.bib24"><label>Gadde(2013)</label><mixed-citation>
Gadde, S.: Graph partitioning algorithms for minimizing inter-node
communication on a distributed system, Ph.D. thesis, The University of
Toledo, 2013.
</mixed-citation></ref-html>
<ref-html id="bib1.bib25"><label>Giraldo et al.(2013)Giraldo, Kelly, and Constantinescu</label><mixed-citation>
Giraldo, F. X., Kelly, J. F., and Constantinescu, E. M.: Implicit-Explicit
Formulations of a Three-Dimensional Nonhydrostatic Unified Model of the
Atmosphere (NUMA), SIAM J. Sci. Comp., 35, B1162–B1194,
<a href="https://doi.org/10.1137/120876034" target="_blank">https://doi.org/10.1137/120876034</a>,   2013.
</mixed-citation></ref-html>
<ref-html id="bib1.bib26"><label>Goodrich et al.(2010)Goodrich, Sitchinava, and Arge</label><mixed-citation>
Goodrich, M. T., Sitchinava, N., and Arge, L.: Parallel external memory graph
algorithms, 2010 IEEE International Symposium on Parallel and Distributed
Processing (IPDPS), 00, 1–11, 2010.
</mixed-citation></ref-html>
<ref-html id="bib1.bib27"><label>Gysi et al.(2015a)Gysi, Grosser, and Hoefler</label><mixed-citation>
Gysi, T., Grosser, T., and Hoefler, T.: MODESTO: Data-centric Analytic
Optimization of Complex Stencil Programs on Heterogeneous Architectures, in:
Proceedings of the 29th ACM on International Conference on Supercomputing,
ICS '15,  177–186, ACM, New York, NY, USA, 2015a.
</mixed-citation></ref-html>
<ref-html id="bib1.bib28"><label>Gysi et al.(2015b)Gysi, Osuna, Fuhrer, Bianco, and
Schulthess</label><mixed-citation>
Gysi, T., Osuna, C., Fuhrer, O., Bianco, M., and Schulthess, T. C.: STELLA: A
Domain-specific Tool for Structured Grid Methods in Weather and Climate
Models, in: Proc. of the Intl. Conf. for High Performance Computing,
Networking, Storage and Analysis, SC '15,  41:1–41:12, ACM, New York, NY,
USA, 2015b.
</mixed-citation></ref-html>
<ref-html id="bib1.bib29"><label>Heise et al.(2006)Heise, Ritter, and Schrodin</label><mixed-citation>
Heise, E., Ritter, B., and Schrodin, R.: Operational implementation of the
multilayer soil model, COSMO Tech. Rep., No. 9, Tech. rep., COSMO, 2006.
</mixed-citation></ref-html>
<ref-html id="bib1.bib30"><label>Hong and Kung(1981)</label><mixed-citation>
Hong, J.-W. and Kung, H. T.: I/O Complexity: The Red-blue Pebble Game, in:
Proceedings of the Thirteenth Annual ACM Symposium on Theory of Computing,
STOC '81, 326–333, ACM, New York, NY, USA, 1981.
</mixed-citation></ref-html>
<ref-html id="bib1.bib31"><label>Hope(2015)</label><mixed-citation>
Hope, C.: The $10 trillion value of better information about the transient
climate response, Philos. T. Roy. Soc. A, 373, 2054, <a href="https://doi.org/10.1098/rsta.2014.0429" target="_blank">https://doi.org/10.1098/rsta.2014.0429</a>,
2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib32"><label>Hutter et al.(2014)Hutter, Iannuzzi, Schiffmann, and
VandeVondele</label><mixed-citation>
Hutter, J., Iannuzzi, M., Schiffmann, F., and VandeVondele, J.: CP2K: atomistic
simulations of condensed matter systems, Wiley Interdisciplinary Reviews:
Computational Molecular Science, 4, 15–25, 2014.
</mixed-citation></ref-html>
<ref-html id="bib1.bib33"><label>Jablonowski and Williamson(2006)</label><mixed-citation>
Jablonowski, C. and Williamson, D. L.: A baroclinic instability test case for
atmospheric model dynamical cores, Q. J. Roy.
Meteorol. Soc., 132, 2943–2975, 2006.
</mixed-citation></ref-html>
<ref-html id="bib1.bib34"><label>Karypis and Kumar(2009)</label><mixed-citation>
Karypis, G. and Kumar, V.: MeTis: Unstructured Graph Partitioning and Sparse
Matrix Ordering System, available at: <a href="http://www.cs.umn.edu/~metis" target="_blank">http://www.cs.umn.edu/~metis</a> (last access: 17 March 2018), 2009.
</mixed-citation></ref-html>
<ref-html id="bib1.bib35"><label>Kendon et al.(2014)</label><mixed-citation>
Kendon, E. J., Roberts, N. M., Fowler, H. J., Roberts,
M. J., Chan, S. C., and Senior, C. A.: Heavier summer downpours with climate change revealed by
weather forecast resolution model, Nat. Clim. Change, 4, 570–576, 2014.
</mixed-citation></ref-html>
<ref-html id="bib1.bib36"><label>Kwok and Ahmad(1999)</label><mixed-citation>
Kwok, Y.-K. and Ahmad, I.: Static Scheduling Algorithms for Allocating Directed
Task Graphs to Multiprocessors, ACM Comput. Surv., 31, 406–471, 1999.
</mixed-citation></ref-html>
<ref-html id="bib1.bib37"><label>Lapillonne and Fuhrer(2014)</label><mixed-citation>
Lapillonne, X. and Fuhrer, O.: Using Compiler Directives to Port Large
Scientific Applications to GPUs: An Example from Atmospheric Science,
Parallel Processing Letters, 24, 1450003, <a href="https://doi.org/10.1142/S0129626414500030" target="_blank">https://doi.org/10.1142/S0129626414500030</a>, 2014.
</mixed-citation></ref-html>
<ref-html id="bib1.bib38"><label>Lapillonne et al.(2016)</label><mixed-citation>
Lapillonne, X., Fuhrer, O., Spörri, P., Osuna, C.,
Walser, A., Arteaga, A., Gysi, T., Rüdisühli, S., Osterried, K., and
Schulthess, T.: Operational numerical weather prediction on a
GPU-accelerated cluster supercomputer, in: EGU General Assembly Conference
Abstracts, Vol. 18 of  EGU General Assembly Conference Abstracts, p. 13554, 2016.
</mixed-citation></ref-html>
<ref-html id="bib1.bib39"><label>Lee et al.(2010)</label><mixed-citation>
Lee, V. W., Kim, C., Chhugani, J., Deisher, M.,
Kim, D., Nguyen, A. D., Satish, N., Smelyanskiy,
M., Chennupaty, S., Hammarlund, P., Singhal, R., and
Dubey, P.: Debunking the 100X GPU vs. CPU Myth: An Evaluation of
Throughput Computing on CPU and GPU, SIGARCH Comput. Archit. News, 38,
451–460, 2010.
</mixed-citation></ref-html>
<ref-html id="bib1.bib40"><label>Leutwyler et al.(2016)Leutwyler, Fuhrer, Lapillonne, Lüthi, and
Schär</label><mixed-citation>
Leutwyler, D., Fuhrer, O., Lapillonne, X., Lüthi, D., and Schär, C.: Towards
European-scale convection-resolving climate simulations with GPUs: a study
with COSMO 4.19, Geosci. Model Dev., 9, 3393–3412,
<a href="https://doi.org/10.5194/gmd-9-3393-2016" target="_blank">https://doi.org/10.5194/gmd-9-3393-2016</a>, 2016.
</mixed-citation></ref-html>
<ref-html id="bib1.bib41"><label>Leutwyler et al.(2017)Leutwyler, Lüthi, Ban, Fuhrer, and
Schär</label><mixed-citation>
Leutwyler, D., Lüthi, D., Ban, N., Fuhrer, O., and Schär, C.: Evaluation of
the convection-resolving climate modeling approach on continental scales,
J. Geophys. Res.-Atmos., 122, 5237–5258,
<a href="https://doi.org/10.1002/2016JD026013" target="_blank">https://doi.org/10.1002/2016JD026013</a>,  2017.
</mixed-citation></ref-html>
<ref-html id="bib1.bib42"><label>Liu et al.(2014)Liu, Zhang, and Zhang</label><mixed-citation>
Liu, L., Zhang, T., and Zhang, J.: DAG Based Multipath Routing Algorithm for
Load Balancing in Machine-to-Machine Networks, Int. J. Distrib. Sens. N.,
10, 457962, <a href="https://doi.org/10.1155/2014/457962" target="_blank">https://doi.org/10.1155/2014/457962</a>, 2014.
</mixed-citation></ref-html>
<ref-html id="bib1.bib43"><label>Ludwig et al.(2015)Ludwig, Pinto, Hoepp, Fink, and
Gray</label><mixed-citation>
Ludwig, P., Pinto, J. G., Hoepp, S. A., Fink, A. H., and Gray, S. L.: Secondary
Cyclogenesis along an Occluded Front Leading to Damaging Wind Gusts:
Windstorm Kyrill, January 2007, Mon. Weather Rev., 143, 1417–1437,
2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib44"><label>Michalakes et al.(2015)</label><mixed-citation>
Michalakes, J., Govett, M., Benson, R., Black, T., Juang, H., Reinecke, A.,
Skamarock, B., Duda, M., Henderson, T., Madden, P., Mozdzynski, G., and
Vasic, R.: AVEC Report: NGGPS Level-1 Benchmarks and Software
Evaluation, Tech. Rep., NGGPS Dynamical Core Test Group, 2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib45"><label>Miura et al.(2007)Miura, Satoh, Nasuno, Noda, and
Oouchi</label><mixed-citation>
Miura, H., Satoh, M., Nasuno, T., Noda, A. T., and Oouchi, K.: A
Madden-Julian Oscillation Event Realistically Simulated by a Global
Cloud-Resolving Model, Science, 318, 1763–1765, 2007.
</mixed-citation></ref-html>
<ref-html id="bib1.bib46"><label>Miyamoto et al.(2013)</label><mixed-citation>
Miyamoto, Y., Kajikawa, Y.,
Yoshida, R., Yamaura, T.,
Yashiro, H., and Tomita, H.: Deep moist atmospheric convection in a subkilometer global
simulation, Geophys. Res. Lett., 40, 4922–4926, <a href="https://doi.org/10.1002/grl.50944" target="_blank">https://doi.org/10.1002/grl.50944</a>, 2013.
</mixed-citation></ref-html>
<ref-html id="bib1.bib47"><label>National Research Council(2012)</label><mixed-citation>
National Research Council:  A National Strategy for Advancing Climate
Modeling. Washington, DC: The National Academies Press,
<a href="https://doi.org/10.17226/13430" target="_blank">https://doi.org/10.17226/13430</a>, 2012.
</mixed-citation></ref-html>
<ref-html id="bib1.bib48"><label>NVIDIA(2016)</label><mixed-citation>
NVIDIA: NVIDIA TESLA P100 Technical Overview, Tech. Rep., available at: <a href="http://images.nvidia.com/content/tesla/pdf/nvidia-teslap100-techoverview.pdf" target="_blank">http://images.nvidia.com/content/tesla/pdf/nvidia-teslap100-techoverview.pdf</a>
(last access: 3 April 2017), 2016.
</mixed-citation></ref-html>
<ref-html id="bib1.bib49"><label>Pachauri and Meyer(2014)</label><mixed-citation>
Pachauri, R. K. and Meyer, L. A. (Eds.): Climate Change 2014: Synthesis Report.
Contribution of Working Groups I, II and III to the Fifth Assessment Report
of the Intergovernmental Panel on Climate Change, p. 151, IPCC, Geneva, Switzerland,
2014.
</mixed-citation></ref-html>
<ref-html id="bib1.bib50"><label>Palmer(2014)</label><mixed-citation>
Palmer, T.: Climate forecasting: Build high-resolution global climate models,
Nature, 515, 338–339, <a href="https://doi.org/10.1038/515338a" target="_blank">https://doi.org/10.1038/515338a</a>, 2014.
</mixed-citation></ref-html>
<ref-html id="bib1.bib51"><label>Park et al.(2013)Park, Skamarock, Klemp, Fowler, and
Duda</label><mixed-citation>
Park, S.-H., Skamarock, W. C., Klemp, J. B., Fowler, L. D., and Duda, M. G.:
Evaluation of Global Atmospheric Solvers Using Extensions of the Jablonowski
and Williamson Baroclinic Wave Test Case, Mon. Weather Rev., 141,
3116–3129, 2013.
</mixed-citation></ref-html>
<ref-html id="bib1.bib52"><label>Ralph(1996)</label><mixed-citation>
Ralph, F. M.: Observations of 250-km-Wavelength Clear-Air Eddies and
750-km-Wavelength Mesocyclones Associated with a Synoptic-Scale Midlatitude
Cyclone, Mon. Weather Rev., 124, 1199–1210, 1996.
</mixed-citation></ref-html>
<ref-html id="bib1.bib53"><label>Raschendorfer(2001)</label><mixed-citation>
Raschendorfer, M.: The new turbulence parameterization of
LM, Quarterly Report of the Operational
NWP-Models of the DWD, No. 19, 3–12 May, 1999.
</mixed-citation></ref-html>
<ref-html id="bib1.bib54"><label>Reinhardt and Seifert(2006)</label><mixed-citation>
Reinhardt, T. and Seifert, A.: A three-category ice scheme for LMK, 6 pp.,
available at: <a href="http://www.cosmo-model.org/content/model/documentation/newsLetters/newsLetter06/cnl6_reinhardt.pdf" target="_blank">http://www.cosmo-model.org/content/model/documentation/newsLetters/newsLetter06/cnl6_reinhardt.pdf</a>
(last access: 20 April 2018), 2006.
</mixed-citation></ref-html>
<ref-html id="bib1.bib55"><label>Ricard et al.(2013)Ricard, Lac, Riette, Legrand, and
Mary</label><mixed-citation>
Ricard, D., Lac, C., Riette, S., Legrand, R., and Mary, A.: Kinetic energy
spectra characteristics of two convection-permitting limited-area models
AROME and Meso-NH, Q. J. Roy. Meteorol. Soc.,
139, 1327–1341, <a href="https://doi.org/10.1002/qj.2025" target="_blank">https://doi.org/10.1002/qj.2025</a>, 2013.
</mixed-citation></ref-html>
<ref-html id="bib1.bib56"><label>Richard et al.(2007)Richard, Buzzi, and Zängl</label><mixed-citation>
Richard, E., Buzzi, A., and Zängl, G.: Quantitative precipitation
forecasting in the Alps: The advances achieved by the Mesoscale Alpine
Programme, Q. J. Roy. Meteorol. Soc., 133, 831–846, <a href="https://doi.org/10.1002/qj.65" target="_blank">https://doi.org/10.1002/qj.65</a>,
2007.
</mixed-citation></ref-html>
<ref-html id="bib1.bib57"><label>Ritter and Geleyn(1992)</label><mixed-citation>
Ritter, B. and Geleyn, J.-F.: A comprehensive radiation scheme for numerical
weather prediction models with potential applications in climate simulations,
Mon. Weather Rev., 120, 303–325, 1992.
</mixed-citation></ref-html>
<ref-html id="bib1.bib58"><label>Schneider et al.(2017)</label><mixed-citation>
Schneider, T., Teixeira, J., Bretherton, C. S., Brient, F.,
Pressel, K. G., Schär, C., and Siebesma, A. P.:
Climate goals and computing the future of clouds, Nat.
Clim. Change, 7, 3–5, <a href="https://doi.org/10.1038/nclimate3190" target="_blank">https://doi.org/10.1038/nclimate3190</a>, 2017.
</mixed-citation></ref-html>
<ref-html id="bib1.bib59"><label>Shalf et al.(2011)Shalf, Dosanjh, and Morrison</label><mixed-citation>
Shalf, J., Dosanjh, S., and Morrison, J.: Exascale Computing Technology
Challenges,  1–25, Springer Berlin Heidelberg, Berlin, Heidelberg,
<a href="https://doi.org/10.1007/978-3-642-19328-6_1" target="_blank">https://doi.org/10.1007/978-3-642-19328-6_1</a>,  2011.
</mixed-citation></ref-html>
<ref-html id="bib1.bib60"><label>Skamarock et al.(2014)Skamarock, Park, Klemp, and
Snyder</label><mixed-citation>
Skamarock, W. C., Park, S.-H., Klemp, J. B., and Snyder, C.: Atmospheric
Kinetic Energy Spectra from Global High-Resolution Nonhydrostatic
Simulations, J. Atmos. Sci., 71, 4369–4381, 2014.

</mixed-citation></ref-html>
<ref-html id="bib1.bib61"><label>Steppeler et al.(2002)Steppeler, Doms, Schättler, Bitzer,
Gassmann, Damrath, and Gregoric</label><mixed-citation>
Steppeler, J., Doms, G., Schättler, U., Bitzer, H., Gassmann, A., Damrath,
U., and Gregoric, G.: Meso gamma scale forecasts using the nonhydrostatic
model LM, Meteorol. Atmos. Phys., 82, 75–96, <a href="https://doi.org/10.1007/s00703-001-0592-9" target="_blank">https://doi.org/10.1007/s00703-001-0592-9</a>, 2002.
</mixed-citation></ref-html>
<ref-html id="bib1.bib62"><label>TOP500(2017)</label><mixed-citation>
TOP500: Supercomputer Site, available at: <a href="http://www.top500.org" target="_blank">http://www.top500.org</a>
(last access: 20 April 2018), 2017.
</mixed-citation></ref-html>
<ref-html id="bib1.bib63"><label>Ullrich et al.(2015)Ullrich, Reed, and Jablonowski</label><mixed-citation>
Ullrich, P. A., Reed, K. A., and Jablonowski, C.: Analytical initial conditions
and an analysis of baroclinic instability waves in f- and <i>β</i>-plane 3D
channel models, Q. J. Roy. Meteorol. Soc., 141,
2972–2988, 2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib64"><label>Vetter(2001)</label><mixed-citation>
Vetter, J. S.: External Memory Algorithms and Data Structures: Dealing with
Massive Data, ACM Comput. Surv., 33, 209–271, 2001.
</mixed-citation></ref-html>
<ref-html id="bib1.bib65"><label>Wicker and Skamarock(2002)</label><mixed-citation>
Wicker, L. J. and Skamarock, W. C.: Time-Splitting Methods for Elastic Models
Using Forward Time Schemes, Mon. Weather Rev., 130, 2088–2097, 2002.
</mixed-citation></ref-html>
<ref-html id="bib1.bib66"><label>Williams et al.(2009)Williams, Waterman, and Patterson</label><mixed-citation>
Williams, S., Waterman, A., and Patterson, D.: Roofline: An Insightful Visual
Performance Model for Multicore Architectures, Commun. ACM, 52, 65–76,
<a href="https://doi.org/10.1145/1498765.1498785" target="_blank">https://doi.org/10.1145/1498765.1498785</a>,  2009.
</mixed-citation></ref-html>
<ref-html id="bib1.bib67"><label>Yang et al.(2016a)</label><mixed-citation>
Yang, C., Xue, W., Fu, H., You, H., Wang, X.,
Ao, Y., Liu, F., Gan, L., Xu, P., Wang, L.,
Yang, G., and Zheng, W.: 10M-core Scalable Fully-implicit Solver
for Nonhydrostatic
Atmospheric Dynamics, in: Proceedings of the International Conference for
High Performance Computing, Networking, Storage and Analysis, SC16,
6:1–6:12, IEEE Press, Piscataway, NJ, USA, 2016a.
</mixed-citation></ref-html>
<ref-html id="bib1.bib68"><label>Yashiro et al.(2016)</label><mixed-citation>
Yashiro, H., Terai, M., Yoshida, R., Iga, S.-I.,
Minami, K., and Tomita, H.: Performance Analysis and Optimization of
Nonhydrostatic
ICosahedral Atmospheric Model (NICAM) on the K Computer and TSUBAME2.5,
in: Proceedings of the Platform for Advanced Scientific Computing Conference
– PASC'16, ACM Press, <a href="https://doi.org/10.1145/2929908.2929911" target="_blank">https://doi.org/10.1145/2929908.2929911</a>, 2016.
</mixed-citation></ref-html>--></article>
