<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing with OASIS Tables v3.0 20080202//EN" "journalpub-oasis3.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:oasis="http://docs.oasis-open.org/ns/oasis-exchange/table" xml:lang="en" dtd-version="3.0"><?xmltex \makeatother\@nolinetrue\makeatletter?>
  <front>
    <journal-meta><journal-id journal-id-type="publisher">GMD</journal-id><journal-title-group>
    <journal-title>Geoscientific Model Development</journal-title>
    <abbrev-journal-title abbrev-type="publisher">GMD</abbrev-journal-title><abbrev-journal-title abbrev-type="nlm-ta">Geosci. Model Dev.</abbrev-journal-title>
  </journal-title-group><issn pub-type="epub">1991-9603</issn><publisher>
    <publisher-name>Copernicus Publications</publisher-name>
    <publisher-loc>Göttingen, Germany</publisher-loc>
  </publisher></journal-meta>
    <article-meta>
      <article-id pub-id-type="doi">10.5194/gmd-11-3659-2018</article-id><title-group><article-title>Requirements for a global data infrastructure in support of CMIP6</article-title><alt-title>Global data infrastructure in support of CMIP6</alt-title>
      </title-group><?xmltex \runningtitle{Global data infrastructure in support of CMIP6}?><?xmltex \runningauthor{V. Balaji et al.}?>
      <contrib-group>
        <contrib contrib-type="author" corresp="yes" rid="aff1 aff2">
          <name><surname>Balaji</surname><given-names>Venkatramani</given-names></name>
          <email>balaji@princeton.edu</email>
        <ext-link>https://orcid.org/0000-0001-7561-5438</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff3">
          <name><surname>Taylor</surname><given-names>Karl E.</given-names></name>
          
        <ext-link>https://orcid.org/0000-0002-6491-2135</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff4">
          <name><surname>Juckes</surname><given-names>Martin</given-names></name>
          
        <ext-link>https://orcid.org/0000-0003-1770-2132</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff5 aff4">
          <name><surname>Lawrence</surname><given-names>Bryan N.</given-names></name>
          
        <ext-link>https://orcid.org/0000-0001-9262-7860</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff3">
          <name><surname>Durack</surname><given-names>Paul J.</given-names></name>
          
        <ext-link>https://orcid.org/0000-0003-2835-1438</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff6">
          <name><surname>Lautenschlager</surname><given-names>Michael</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff7 aff2">
          <name><surname>Blanton</surname><given-names>Chris</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff8">
          <name><surname>Cinquini</surname><given-names>Luca</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff9">
          <name><surname>Denvil</surname><given-names>Sébastien</given-names></name>
          
        <ext-link>https://orcid.org/0000-0002-6715-3533</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff10">
          <name><surname>Elkington</surname><given-names>Mark</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff9">
          <name><surname>Guglielmo</surname><given-names>Francesca</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff9 aff4">
          <name><surname>Guilyardi</surname><given-names>Eric</given-names></name>
          
        <ext-link>https://orcid.org/0000-0002-2255-8625</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff4">
          <name><surname>Hassell</surname><given-names>David</given-names></name>
          
        <ext-link>https://orcid.org/0000-0001-5106-7502</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff11">
          <name><surname>Kharin</surname><given-names>Slava</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff6">
          <name><surname>Kindermann</surname><given-names>Stefan</given-names></name>
          
        <ext-link>https://orcid.org/0000-0001-9335-1093</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff1 aff2">
          <name><surname>Nikonov</surname><given-names>Sergey</given-names></name>
          
        </contrib>
        <contrib contrib-type="author" corresp="no" rid="aff7 aff2">
          <name><surname>Radhakrishnan</surname><given-names>Aparna</given-names></name>
          
        <ext-link>https://orcid.org/0000-0002-2843-931X</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff6">
          <name><surname>Stockhause</surname><given-names>Martina</given-names></name>
          
        <ext-link>https://orcid.org/0000-0001-6636-4972</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff6">
          <name><surname>Weigel</surname><given-names>Tobias</given-names></name>
          
        <ext-link>https://orcid.org/0000-0002-4040-0215</ext-link></contrib>
        <contrib contrib-type="author" corresp="no" rid="aff3">
          <name><surname>Williams</surname><given-names>Dean</given-names></name>
          
        </contrib>
        <aff id="aff1"><label>1</label><institution>Princeton University, Cooperative Institute of Climate
Science, Princeton, NJ 08540, USA</institution>
        </aff>
        <aff id="aff2"><label>2</label><institution>NOAA/Geophysical Fluid Dynamics Laboratory, Princeton, NJ 08540,
USA</institution>
        </aff>
        <aff id="aff3"><label>3</label><institution>PCMDI, Lawrence Livermore National Laboratory, Livermore, CA 94550, USA</institution>
        </aff>
        <aff id="aff4"><label>4</label><institution>Science and Technology Facilities Council, Abingdon, UK</institution>
        </aff>
        <aff id="aff5"><label>5</label><institution>National Centre for Atmospheric Science, University of
Reading, Reading, UK</institution>
        </aff>
        <aff id="aff6"><label>6</label><institution>Deutsches KlimaRechenZentrum GmbH, Hamburg, Germany</institution>
        </aff>
        <aff id="aff7"><label>7</label><institution>Engility Corporation, NJ 08540, USA</institution>
        </aff>
        <aff id="aff8"><label>8</label><institution>Jet Propulsion Laboratory (JPL), 4800 Oak Grove Drive,
Pasadena, CA 91109, USA</institution>
        </aff>
        <aff id="aff9"><label>9</label><institution>Institut Pierre Simon Laplace, CNRS/UPMC, Paris, France</institution>
        </aff>
        <aff id="aff10"><label>10</label><institution>Met Office, FitzRoy Road, Exeter, EX1 3PB, UK</institution>
        </aff>
        <aff id="aff11"><label>11</label><institution>Canadian Centre for Climate Modelling and Analysis,
Atmospheric Environment Service, <?xmltex \hack{\newline}?>University of Victoria, Victoria, BC, Canada</institution>
        </aff>
      </contrib-group>
      <author-notes><corresp id="corr1">Venkatramani Balaji (balaji@princeton.edu)</corresp></author-notes><pub-date><day>11</day><month>September</month><year>2018</year></pub-date>
      
      <volume>11</volume>
      <issue>9</issue>
      <fpage>3659</fpage><lpage>3680</lpage>
      <history>
        <date date-type="received"><day>21</day><month>February</month><year>2018</year></date>
           <date date-type="rev-request"><day>22</day><month>March</month><year>2018</year></date>
           <date date-type="rev-recd"><day>26</day><month>July</month><year>2018</year></date>
           <date date-type="accepted"><day>9</day><month>August</month><year>2018</year></date>
      </history>
      <permissions>
        
        
      <license license-type="open-access"><license-p>This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this licence, visit <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link></license-p></license></permissions><self-uri xlink:href="https://gmd.copernicus.org/articles/11/3659/2018/gmd-11-3659-2018.html">This article is available from https://gmd.copernicus.org/articles/11/3659/2018/gmd-11-3659-2018.html</self-uri><self-uri xlink:href="https://gmd.copernicus.org/articles/11/3659/2018/gmd-11-3659-2018.pdf">The full text article is available as a PDF file from https://gmd.copernicus.org/articles/11/3659/2018/gmd-11-3659-2018.pdf</self-uri>
      <abstract>
    <p id="d1e326">The World Climate Research Programme (WCRP)'s Working Group on Climate
Modelling (WGCM) Infrastructure Panel (WIP) was formed in 2014 in response to
the explosive growth in size and complexity of Coupled Model Intercomparison
Projects (CMIPs) between CMIP3 (2005–2006) and CMIP5 (2011–2012). This
article presents the WIP recommendations for the global data infrastructure
needed to support CMIP design, future growth, and evolution. Developed in
close coordination with those who build and run the existing infrastructure
(the Earth System Grid Federation; ESGF), the recommendations are based on
several principles beginning with the need to separate requirements,
implementation, and operations. Other important principles include the
consideration of the diversity of community needs around data – a
data ecosystem – the importance of provenance, the need for
automation, and the obligation to measure costs and benefits.</p>
    <p id="d1e329">This paper concentrates on requirements, recognizing the diversity
of communities involved (modellers, analysts, software developers,
and downstream users). Such requirements include the need for
scientific reproducibility and accountability alongside the need to
record and track data usage. One key element is to generate a
dataset-centric rather than system-centric focus, with an aim to
making the infrastructure less prone to systemic failure.</p>
    <p id="d1e332">With these overarching principles and requirements, the WIP has
produced a set of position papers, which are summarized in the
latter pages of this document. They provide specifications for
managing and delivering model output, including strategies for
replication and versioning, licensing, data quality assurance,
citation, long-term archiving, and dataset tracking. They also
describe a new and more formal approach for specifying what data,
and associated metadata, should be saved, which enables future data
volumes to be estimated, particularly for well-defined projects such
as CMIP6.</p>
    <p id="d1e335">The paper concludes with a future-facing consideration of the global
data infrastructure evolution that follows from the blurring of
boundaries between climate and weather, and the changing nature of
published scientific results in the digital age.</p>
  </abstract>
    </article-meta>
  </front>
<body>
      

<?pagebreak page3660?><sec id="Ch1.S1" sec-type="intro">
  <title>Introduction</title>
      <p id="d1e345">CMIP6 <xref ref-type="bibr" rid="bib1.bibx14" id="paren.1"/>, the latest Coupled Model Intercomparison
Project (CMIP), can trace its genealogy back to the “Charney report”
<xref ref-type="bibr" rid="bib1.bibx8" id="paren.2"/>. This seminal report on the links between
<inline-formula><mml:math id="M1" display="inline"><mml:mrow class="chem"><mml:msub><mml:mi mathvariant="normal">CO</mml:mi><mml:mn mathvariant="normal">2</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> and climate was an authoritative summary of the state of the
science at the time and produced findings that have stood the test of time
<xref ref-type="bibr" rid="bib1.bibx6" id="paren.3"/>. It is often noted <xref ref-type="bibr" rid="bib1.bibx2" id="paren.4"><named-content content-type="pre">e.g.</named-content></xref> that the range and uncertainty bounds on
equilibrium climate sensitivity generated in this report have not
fundamentally changed, despite the enormous increase in resources devoted to
analysing the problem in the decades since <xref ref-type="bibr" rid="bib1.bibx28" id="paren.5"><named-content content-type="pre">e.g.</named-content></xref>.</p>
      <p id="d1e378">Beyond its enduring findings on climate sensitivity, the Charney report also
gave rise to a methodology for the treatment of uncertainties and gaps in
understanding, which has been equally influential, and is in fact the basis
of CMIP itself. The report can be seen as one of the first uses of the
“multi-model ensemble”. At the time, there were two models available
representing the equilibrium response of the climate system to a change in
<inline-formula><mml:math id="M2" display="inline"><mml:mrow class="chem"><mml:msub><mml:mi mathvariant="normal">CO</mml:mi><mml:mn mathvariant="normal">2</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> forcing, one from Syukuro Manabe's group at NOAA's Geophysical
Fluid Dynamics Laboratory (NOAA-GFDL) and the other from James Hansen's group
at NASA's Goddard Institute for Space Studies (NASA-GISS). Then as now, these
groups marshalled vast state-of-the-art computing and data resources to run
very challenging simulations of the Earth system. The report's results were
based on an ensemble of three runs from the Manabe group <xref ref-type="bibr" rid="bib1.bibx32" id="paren.6"><named-content content-type="pre">e.g.</named-content></xref>
and two from the Hansen group <xref ref-type="bibr" rid="bib1.bibx23" id="paren.7"><named-content content-type="pre">e.g.</named-content></xref>.</p>
      <p id="d1e402">The Atmospheric Model Intercomparison Project <xref ref-type="bibr" rid="bib1.bibx17" id="paren.8"><named-content content-type="pre">AMIP:</named-content></xref>
was one of the first systematic cross-model comparisons open to anyone who
wished to participate. By the time of the Intergovernmental Panel on Climate
Change (IPCC)'s First Assessment Report (FAR) in 1990
<xref ref-type="bibr" rid="bib1.bibx25" id="paren.9"/>, the process had been formalized. At this stage,
there were five models participating in the exercise, and some of what is now
called the “Diagnosis, Evaluation, and Characterization of Klima”
<xref ref-type="bibr" rid="bib1.bibx14" id="paren.10"><named-content content-type="pre">DECK, see</named-content></xref> experiments<fn id="Ch1.Footn1"><p id="d1e418">“Klima” is
German for “climate”.</p></fn> had been standardized (AMIP, a preindustrial
control, 1 % year<inline-formula><mml:math id="M3" display="inline"><mml:msup><mml:mi/><mml:mrow><mml:mo>-</mml:mo><mml:mn mathvariant="normal">1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> <inline-formula><mml:math id="M4" display="inline"><mml:mrow class="chem"><mml:msub><mml:mi mathvariant="normal">CO</mml:mi><mml:mn mathvariant="normal">2</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> increase to doubling, etc.). The future
“scenarios” had emerged as well, for a total of five different experimental
protocols. Fast-forwarding to today, CMIP6 expects more than 100
models<fn id="Ch1.Footn2"><p id="d1e446"><uri>https://rawgit.com/WCRP-CMIP/CMIP6_CVs/master/src/CMIP6_source_id.html</uri>
(last access: 17 August 2018)</p></fn> from more than 40 modelling
centres<fn id="Ch1.Footn3"><p id="d1e452"><uri>https://rawgit.com/WCRP-CMIP/CMIP6_CVs/master/src/CMIP6_institution_id.html</uri>
(last access: 17 August 2018)</p></fn> <xref ref-type="bibr" rid="bib1.bibx8" id="paren.11"><named-content content-type="pre">in 27 countries, a stark contrast to
the US monopoly in</named-content></xref> to participate in the DECK and
historical experiments <xref ref-type="bibr" rid="bib1.bibx14" id="paren.12"><named-content content-type="pre">Table 2 of</named-content></xref>, and some
subset of these to participate in one or more of the 23 MIPs endorsed by the
CMIP Panel <xref ref-type="bibr" rid="bib1.bibx14" id="paren.13"><named-content content-type="pre">Table 3 of</named-content><named-content content-type="post">originally 21 with two new MIPs more recently
endorsed</named-content></xref>. The MIPs call for 287
experiments<fn id="Ch1.Footn4"><p id="d1e476"><uri>https://rawgit.com/WCRP-CMIP/CMIP6_CVs/master/src/CMIP6_experiment_id.html</uri>
(last access: 17 August 2018)</p></fn>, a considerable expansion over CMIP5.</p>
      <p id="d1e482">Alongside the experiments themselves is the “data
request”<fn id="Ch1.Footn5"><p id="d1e485"><uri>http://clipc-services.ceda.ac.uk/dreq/index.html</uri> <?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> which defines, for each CMIP experiment, what output
each model should provide for analysis. The complexity of this data request
has also grown tremendously over the CMIP era. A typical dataset from the FAR
archive (from the GFDL R15
model<fn id="Ch1.Footn6"><p id="d1e493"><uri>https://cera-www.dkrz.de/WDCC/ui/cerasearch/entry?acronym=IPCC_DDC_FAR_GFDL_R15TRCT_D</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>) lists climatologies and time series of a few
basic climate variables such as surface air temperature, and the dataset size
is about 200 MB. The CMIP6 data request (Juckes et al., 2015) lists
literally thousands of variables, from eight modelling “realms” (e.g.
atmosphere, ocean, land, atmospheric chemistry, land ice, ocean
biogeochemistry, and sea ice) from the hundreds of experiments mentioned
above. This growth in complexity is testament to the modern understanding of
many physical, chemical, and biological processes which were simply absent
from models of the Charney report era.</p>
      <p id="d1e501">The simulation output is now a primary scientific resource for researchers
the world over, rivalling the volume of observed weather and climate data from
the global array of sensors and satellites <xref ref-type="bibr" rid="bib1.bibx35" id="paren.14"/>.
Climate science and observed and simulated climate data have
now become primary elements in the “vast machine” <xref ref-type="bibr" rid="bib1.bibx13" id="paren.15"/>
serving the global climate and weather research enterprise.</p>
      <p id="d1e510">Managing and sharing this huge amount of data is an enterprise in its own
right – and the solution established for CMIP5 was the global Earth System
Grid Federation <xref ref-type="bibr" rid="bib1.bibx41 bib1.bibx42" id="paren.16"><named-content content-type="pre">ESGF,</named-content></xref>.
ESGF was identified by the WCRP Joint Scientific Committee in 2013 as the
recommended infrastructure for data archiving and dissemination for the
programme. A map of sites participating in the ESGF is shown in
Fig. <xref ref-type="fig" rid="Ch1.F1"/> drawn from the IS-ENES data
portal<fn id="Ch1.Footn7"><p id="d1e520"><uri>https://portal.enes.org/data/is-enes-data-infrastructure/esgf</uri>
(last access: 17 August 2018)</p></fn>. The sites are diverse and responsive to many
national and institutional missions. With multiple agencies and institutions,
and many uncoordinated and possibly conflicting requirements, the ESGF itself
is a complex and delicate artifact to manage.</p>

      <?xmltex \floatpos{t}?><fig id="Ch1.F1" specific-use="star"><caption><p id="d1e528">Sites participating in the Earth System Grid Federation in May 2017.
Figure courtesy of the IS-ENES data portal.</p></caption>
        <?xmltex \igopts{width=426.791339pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/3659/2018/gmd-11-3659-2018-f01.png"/>

      </fig>

      <p id="d1e537">The sheer size and complexity of this infrastructure emerged as a matter of
great concern at the end of CMIP5, when the growth in data volume relative to
CMIP3 (from 40 TB to 2 PB, a 50-fold increase in 6 years) suggested the
community was on an unsustainable path. These concerns led to the 2014
recommendation of the WGCM to form an infrastructure panel (based
upon a
proposal<fn id="Ch1.Footn8"><p id="d1e540"><uri>https://drive.google.com/file/d/0B7Pi4aN9R3k3OHpIWC16Z0JBX3c/view?usp=sharing</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> at the 2013 annual meeting). The WGCM
Infrastructure Panel (WIP) was tasked with examining the global computational
and data infrastructure underpinning CMIP, and improving communication
between the teams overseeing the scientific and experimental design of these
globally coordinated experiments, and the teams providing resources and
designing that infrastructure. The communication was intended to be two-way:
both providing input to the provisioning of infrastructure appropriate to the
experimental design and informing the scientific design of the technical
(and financial) limits of that infrastructure.</p>
      <p id="d1e547">This paper provides a summary of the findings by the WIP in the first 3
years of activity since its formation in 2014, and the consequent
recommendations – in the context of existing organizational and funding
constraints. In the text below, we refer to “findings”,
“requirements”, and “recommendations”. Findings refer to
observations about the state of affairs: technologies, resource constraints,
and the like, based upon our analysis. Requirements are design goals that
have been shared with those building the infrastructure, such as the ESGF
software and security stack. Recommendations are our guidance to the
community: experiment designers, modelling centres, and the users of climate
data.</p>
      <p id="d1e551">The intended audience for the paper is primarily the CMIP6 scientific
community. In particular, we aim to show how the scientific design of CMIP6
as outlined in <xref ref-type="bibr" rid="bib1.bibx14" id="text.17"/> translates into infrastructural
requirements. We hope this will be instructive to the MIP chairs and creators
of multi-model experiments, by highlighting the resource implications of their
experimental design, and to data providers (modelling centres), by explaining
the sometimes opaque requirements imposed upon them as a requisite for
participation. By describing how the design of this infrastructure is severely
constrained by resources, we hope to provide a useful perspective to those
who find data acquisition and analysis a technical challenge. Finally, we
hope this will be of interest to general readers of the journal from other
geoscience fields, illuminating the particular character of global data
infrastructure for climate data, where the community of users far outstrips, in
numbers and diversity, the Earth system modelling community itself.</p>
      <p id="d1e557">In Sect. <xref ref-type="sec" rid="Ch1.S2"/>, the principles and scientific rationale
underlying the requirements for global data infrastructure are articulated.
In Sect. <xref ref-type="sec" rid="Ch1.S3"/> the CMIP6 data request is covered: standards and
conventions, requirements for modelling centres to process a complex data
request, and projections of data volume. In Sect. <xref ref-type="sec" rid="Ch1.S4"/>, the
recent evolution in how data are archived is reviewed alongside a licensing
strategy consistent with current practice and scientific principle. In
Sect. <xref ref-type="sec" rid="Ch1.S5"/> issues surrounding data as a citable resource are
discussed, including the technical infrastructure for the creation of citable
data, and the documentation and other standards required to make data a
first-class scientific entity. In Sect. <xref ref-type="sec" rid="Ch1.S6"/> the implications of
data replicas, and in Sect. <xref ref-type="sec" rid="Ch1.S7"/> issues surrounding data
versioning, retraction, and errata are addressed. Section <xref ref-type="sec" rid="Ch1.S8"/>
provides an outlook for the future of global data infrastructure, looking
beyond CMIP6 towards a unified view of the “vast machine” for weather and
climate data and computation.</p>
</sec>
<?pagebreak page3662?><sec id="Ch1.S2">
  <title>Principles and constraints</title>
      <p id="d1e581">This section lays out some of the principles and constraints which
have resulted from the evolution of infrastructure requirements since
the first CMIP experiment – beginning with a historical context.</p>
<sec id="Ch1.S2.SS1">
  <title>Historical context</title>
      <p id="d1e589">In the pioneering days of CMIP, the community of participants was small and
well-knit, and all the issues involved in generating datasets for common
analysis from different modelling groups were settled by mutual agreement
(Ron Stouffer, personal communication, 2016).
Analysis was performed by the same
community that performed the simulations. The Program for Climate Model
Diagnosis and Intercomparison (PCMDI), established at Lawrence Livermore
National Laboratory (USA) in 1989, had championed the idea of a more systematic
analysis of models, and in close cooperation with the climate modelling
centres, PCMDI assumed responsibility for much of the day-to-day coordination
of CMIP. Until CMIP3, the hosting of datasets from different modelling groups
could be managed at a single archiving site; PCMDI alone hosted the entire
40 TB archive.</p>
      <p id="d1e592">From its earliest phases, CMIP grew in importance, and its results
have provided a major pillar that supports the periodic
Intergovernmental Panel on Climate Change (IPCC) assessment
activities. However, the explosive growth in the scope of CMIP,
especially between CMIP3 and CMIP5, represented a tipping point in the
supporting infrastructure. Not only was it clear that no one site
could manage all the data, the necessary infrastructure software and
operational principles could no longer be delivered and managed by
PCMDI alone.</p>
      <p id="d1e595">For CMIP5, PCMDI sought help from a number of partners under the
auspices of the Global Organisation of Earth System Science Portals
(GO-ESSP). Many of the GO-ESSP partners who became the foundation
members and developers of the Earth System Grid Federation re-targeted
existing research funding to help develop ESGF. The primary heritage
derived from the original US Earth System Grid project funded by the
US Department of Energy, but increasingly major contributions came
from new international partners. This meant that many aspects of the
ESGF system began from work which was designed in the context of
different requirements, collaborations, and objectives. At the
beginning, none of the partners had funds for operational support for
the fledgling international federation, and even after the end of
CMIP5 proper (circa 2014), the ongoing ESGF has been sustained
primarily by small amounts of funding at a handful of the primary ESGF
sites. Most ESGF sites have had little or no formal operational
support. Many of the known limitations of the CMIP5 ESGF – both in
terms of functionality and performance – were a direct consequence of
this heritage.<?xmltex \hack{\newpage}?></p>
      <p id="d1e599">With the advent of CMIP6 (in addition to some sister projects such as
obs4MIPs, input4MIPs, and CREATE-IP), it was clear that a fundamental
reassessment would be needed to address the evolving scientific and
operational requirements. That clarity led to the establishment of the
WIP, but it has yet to lead to any formal joint funding arrangement –
the ESGF and the data nodes within it remain funded (if at all; many
data nodes are marginal activities supported on best efforts) by
national agencies with disparate timescales and objectives. Several
critical software elements also are being developed on volunteer
efforts and shoestring budgets. This finding has been noted in the US
National Academies Report on “A National Strategy for Advancing
Climate Modeling” <xref ref-type="bibr" rid="bib1.bibx34" id="paren.18"/>, which warned of the
consequences of inadequate infrastructure funding.</p>
</sec>
<sec id="Ch1.S2.SS2">
  <title>Infrastructural principles</title>
      <p id="d1e611"><list list-type="custom">
            <list-item><label>1.</label>

      <p id="d1e616">With greater complexity and a globally distributed data
resource, it has become clear that in the design of globally coordinated
scientific experiments, the global computational and data infrastructure
needs to be formally examined as an integrated element.</p>

      <p id="d1e619">The membership of the WIP, drawn as it is from experts in various
aspects of the infrastructure, is a direct consequence of this
requirement for integration. Representatives of modelling centres,
infrastructure developers, and stakeholders in the scientific design
of CMIP and its output comprise the panel membership. One of the
WIP's first acts was to consider three phases in the process of
infrastructure development: requirements,
implementation, and operations, all informed by the
builders of workflows at the modelling centres.</p>

      <p id="d1e622"><list list-type="bullet">
                  <list-item>

      <p id="d1e627">The WIP, in consort with the WCRP's CMIP Panel, takes
responsibility for articulating the requirements for the
infrastructure.</p>
                  </list-item>
                  <list-item>

      <p id="d1e633">The implementation is in the hands of the
infrastructure developers, principally ESGF for the federated
archive <xref ref-type="bibr" rid="bib1.bibx42" id="paren.19"/>, but also related projects
like Earth System Documentation
<xref ref-type="bibr" rid="bib1.bibx21" id="paren.20"><named-content content-type="pre">ES-DOC<fn id="Ch1.Footn9"><p id="d1e642"><uri>https://www.earthsystemcog.org/projects/es-doc-models/</uri>
(last access: 17 August 2018)</p></fn>,</named-content></xref>.</p>
                  </list-item>
                  <list-item>

      <?pagebreak page3663?><p id="d1e653">In 2016 at the WIP's request, the “CMIP6 Data Node
Operations Team” (CDNOT) was formed. It is charged with
ensuring that all the infrastructure elements needed by CMIP6 are
properly deployed and actually working as intended at the sites
hosting CMIP6 data. It is also responsible for the operational
aspects of the federation itself, including specifying what
versions of the toolchain are run at every site at any given time,
and organizing coordinated version and security upgrades across
the federation.</p>
                  </list-item>
                </list></p>

      <p id="d1e658">Although there is now a clear separation of concerns into
requirements, implementation, and operations, close links are
maintained by cross-membership between the key bodies, including the
WIP itself, the CMIP Panel, the ESGF Executive Committee, and the
CDNOT.</p>
            </list-item>
            <list-item id="Ch1.I1.i2"><label>2.</label>

      <p id="d1e664">With the basic fact of anthropogenic climate change
now well established <xref ref-type="bibr" rid="bib1.bibx38" id="paren.21"><named-content content-type="pre">see, e.g.</named-content></xref>, the
scientific communities with an interest in CMIP are expanding. For
example, a substantial body of work has begun to emerge to examine
climate impacts. In addition to the specialists in Earth system
science – who also design and run the experiments and produce the
model output – those relying on CMIP output now include those
developing and providing climate services, as well as
“consumers” from allied fields studying the impacts of climate
change on health, agriculture, natural resources, human migration,
and similar issues <xref ref-type="bibr" rid="bib1.bibx33" id="paren.22"/>. This confronts us with
a “scientific scalability” issue (the data during its lifetime
will be consumed by a community much larger, both in sheer numbers
and in breadth of interest and perspective, than the Earth
system modelling community itself), which needs to be addressed.</p>

      <p id="d1e675">Accordingly, we note the requirement that infrastructure should
ensure maximum transparency and usability for user (consumer)
communities at some distance from the modelling (producer)
communities.</p>
            </list-item>
            <list-item id="Ch1.I1.i3"><label>3.</label>

      <p id="d1e681">While CMIP and the IPCC are formally independent,
the CMIP archive is increasingly a reference in formulating climate
policy. Hence the scientific reproducibility
<xref ref-type="bibr" rid="bib1.bibx9" id="paren.23"/> and the underlying durability
and provenance of data have now become matters of central
importance: the ability, long after the creation of the dataset, to trace back
from model output to the configuration of models and the procedures
and choices made along the way. This led the IPCC to require data
distribution centres (DDCs) that attempt to guarantee the archiving
and dissemination of these data in perpetuity, and subsequently to a
requirement in the CMIP context of achieving reproducibility. Given
the use of multi-model ensembles for both consensus estimates and
uncertainty bounds on climate projections, it is important to
document – as precisely as possible, given the independent
genealogy and structure of many models – the details and
differences among model configurations and analysis methods, to
deliver both the requisite provenance and the routes to
reproduction.</p>
            </list-item>
            <list-item id="Ch1.I1.i4"><label>4.</label>

      <p id="d1e690">With the expectation that CMIP DECK experiment
results should be routinely contributed to CMIP, opportunities now
exist for engaging in a more systematic and routine evaluation of
Earth system models (ESMs). This has led to community efforts to
develop standard metrics of model “quality”
<xref ref-type="bibr" rid="bib1.bibx15 bib1.bibx18" id="paren.24"/>. Typical multi-model
analysis has hitherto taken the multi-model average, assigning equal
weight to each model, as the most likely estimate of climate
response. This “model democracy” <xref ref-type="bibr" rid="bib1.bibx27" id="paren.25"/> has been
called into question and there is now a considerable literature
exploring the potential of weighting models by quality
<xref ref-type="bibr" rid="bib1.bibx28" id="paren.26"/>. The development of standard metrics
would aid this kind of research.</p>

      <p id="d1e702">To that end, there is now a requirement to enable (through the ESGF) a
framework for accommodating quasi-operational evaluation tools that
could routinely execute a series of standardized evaluation tasks.
This would provide data consumers with an increasingly (over time)
systematic characterization of models. It may be some time before a
fully operational system of this kind can be implemented, but
planning must start now.</p>

      <p id="d1e705">In addition, there is an increased interest in climate analytics as
a service <xref ref-type="bibr" rid="bib1.bibx4 bib1.bibx37" id="paren.27"/>. This
follows the principle of placing analysis close to the data. Some
centres plan to add resources that combine archiving and analysis
capabilities, for example, NCAR's CMIP Analysis
Platform<fn id="Ch1.Footn10"><p id="d1e711"><uri>https://www2.cisl.ucar.edu/resources/cmip-analysis-platform</uri>
(last access: 17 August 2018)</p></fn>, or the UK's JASMIN
<xref ref-type="bibr" rid="bib1.bibx31" id="paren.28"/>. There are also new efforts to bring
climate data storage and analysis to the cloud era
<xref ref-type="bibr" rid="bib1.bibx11" id="paren.29"><named-content content-type="pre">e.g.</named-content></xref><fn id="Ch1.Footn11"><p id="d1e724"><uri>https://github.com/ESGF/esgf-compute-api</uri> (last access: 17 August 2018)</p></fn>.
Platforms such as Pangeo<fn id="Ch1.Footn12"><p id="d1e730"><uri>http://pangeo-data.org/</uri>
(last access: 17 August 2018)</p></fn> show promise in this realm, and widespread
experimentation and adoption is encouraged.</p>
            </list-item>
            <list-item id="Ch1.I1.i5"><label>5.</label>

      <p id="d1e739">As the experimental design of CMIP has grown in complexity,
costs both in time and money have become a matter of great concern,
particularly for those designing, carrying out, and storing
simulations. In order to justify commitment of resources to CMIP,
mechanisms to identify costs and benefits in developing new models,
performing CMIP simulations, and disseminating the model output need
to be developed.</p>

      <?pagebreak page3664?><p id="d1e742">To quantify the scientific impact of CMIP, measures are needed to
track the use of model output and its value to consumers. In
addition to usage quantification, credit and tracing data usage in
literature via citation of data is important. Current practice is at
best citing large data collections provided by a CMIP participant,
or all of CMIP. Accordingly, we note the need for a mechanism to
identify and cite data provided by each modelling centre.
Alongside the intellectual contribution to model development, which
can be recognized by citation, there is a material cost to centres
in computing and data processing, which is both burdensome and
poorly understood by those requesting, designing, and using the
results from CMIP experiments, who might not be in the business of
model development. The criteria for endorsement introduced in CMIP6
<xref ref-type="bibr" rid="bib1.bibx14" id="paren.30"><named-content content-type="pre">see Table 1 in</named-content></xref> begin to grapple with
this issue, but the costs still need to be measured and recorded. To
begin documenting these costs for CMIP6, the “Computational
Performance” MIP project (CPMIP) <xref ref-type="bibr" rid="bib1.bibx5" id="paren.31"/> has
been established, which will measure, among other things, throughput
(simulated years per day) and cost (core-hours and joules per
simulated year) as a function of model resolution and complexity.
New tools for estimating data volumes have also been developed, see
Sect. <xref ref-type="sec" rid="Ch1.S3.SS1"/> below.</p>
            </list-item>
            <list-item id="Ch1.I1.i6"><label>6.</label>

      <p id="d1e759">Experimental specifications have become ever more
complex, making it difficult to verify that experiment
configurations conform to those specifications. Several modelling
centres have encountered this problem in preparing for CMIP6,
noting, for example, the challenging intricacies in dealing with
input forcing data <xref ref-type="bibr" rid="bib1.bibx12" id="paren.32"><named-content content-type="pre">see</named-content></xref>, output
variable lists <xref ref-type="bibr" rid="bib1.bibx26" id="paren.33"/>, and crossover
requirements between the endorsed MIPs and the DECK
<xref ref-type="bibr" rid="bib1.bibx14" id="paren.34"/>. Moreover, these protocols inevitably
evolve over time, as errors are discovered or enhancements proposed,
and centres need to be adaptable in their workflows accordingly.</p>

      <p id="d1e773">Therefore, we note a requirement to encode the protocols to be
directly ingested by workflows, in other words,
“machine-readable experiment design”. The intent is to avoid,
as far as possible, errors in conformance to design requirements
introduced by the need for humans to transcribe and implement the
protocols, for instance, deciding what variables to save from what
experiments. This is accomplished by encoding most of the
specifications in standard, structured, and machine-readable text
formats (XML and JSON) which can be directly read by the scripts
running the model and post-processing, as explained further below in
Sect. <xref ref-type="sec" rid="Ch1.S3"/>. The requirement spans all of the
“controlled vocabularies” CMIP6_CVs<fn id="Ch1.Footn13"><p id="d1e778"><uri>https://github.com/WCRP-CMIP/CMIP6_CVs</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>: for instance the names assigned to models,
experiments, and output variables used in the CMIP protocols as
well as the CMIP6 data request <xref ref-type="bibr" rid="bib1.bibx26" id="paren.35"/>, which
must be stored in version-controlled, machine-readable formats.
Precisely documenting the “conformance” of experiments to the
protocols <xref ref-type="bibr" rid="bib1.bibx30" id="paren.36"/> is an additional requirement.</p>
            </list-item>
            <list-item id="Ch1.I1.i7"><label>7.</label>

      <p id="d1e794">The transition from a unitary archive at PCMDI in
CMIP3 to a globally federated archive in CMIP5 led to many changes
in the way users interact with the archive, which impacts management
of information about users and complicates communications with them.
In particular, a growing number of data users no longer registered
or interacted directly with the ESGF. Rather they relied on
secondary repositories, often copies of some portion of the ESGF
archive created by others at a particular time (see, for instance, the IPCC CMIP5 Data
Factsheet<fn id="Ch1.Footn14"><p id="d1e797"><uri>http://www.ipcc-data.org/docs/factsheets/TGICA_Fact_Sheet_CMIP5_data_provided_at_the_IPCC_DDC_Ver_1_2016.pdf</uri>
(last access: 17 August 2018)</p></fn> for a discussion of the snapshots and their
coverage). This meant that reliance on the ESGF's inventory of
registered users for any aspect of the infrastructure – such as
tracking usage, compliance with licensing requirements, or informing
users about errata or retractions – could at best ensure partial
coverage of the user base.</p>

      <p id="d1e803">This key finding implies a more distributed design for several
features outlined below, which devolve many of these features to the
datasets themselves rather than the archives. One may think of this
as a “dataset-centric rather than system-centric” design (in
software terms, a “pull” rather than “push” design):
information is made available upon request at the user/dataset
level, relieving the ESGF implementation of an impossible burden.</p>
            </list-item>
          </list></p>
      <p id="d1e808">Based upon the above considerations, the WIP produced a set of position
papers (see Appendix <xref ref-type="sec" rid="App1.Ch1.S1"/>) encapsulating specifications and
recommendations for CMIP6 and beyond. These papers, summarized below, are
available from the WIP
website<fn id="Ch1.Footn15"><p id="d1e813"><uri>https://www.earthsystemcog.org/projects/wip/</uri> <?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>. As the WIP continues to develop additional
recommendations, they too will be made available. As requirements evolve, a
modified document will be released with a new version number.</p>
</sec>
</sec>
<sec id="Ch1.S3">
  <title>A structured approach to data production</title>
      <?pagebreak page3665?><p id="d1e828">The CMIP6 data framework has evolved considerably from CMIP5, and follows the
principles of scientific reproducibility (Item 3 in
Sect. 2.2) and the recognition that the complexity of the
experimental design (Item 6) required far greater degrees of
automation within the production workflow generating simulation results. As a
starting point, all elements in the experiment specifications must be
recorded in structured text formats (XML and JSON, for example), and any
changes must be tracked through careful version control.
“Machine-readable” specification of all aspects of the model output
configuration is a design goal, as noted earlier.<?xmltex \hack{\newpage}?></p>
      <p id="d1e832">The data request spans several elements discussed in sub-sections
below.</p>
<sec id="Ch1.S3.SS1">
  <title>CMIP6 data request</title>
      <p id="d1e840">The CMIP6 data
request<fn id="Ch1.Footn16"><p id="d1e843"><uri>http://clipc-services.ceda.ac.uk/dreq/index.html</uri> <?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> specifies which variables should be archived for
each experiment. It is one of the most complex elements of the CMIP6
infrastructure due to the complexity of the new design outlined in
<xref ref-type="bibr" rid="bib1.bibx14" id="normal.37"/>. The experimental design now involves three tiers of
experiments, where an individual modelling group may choose which ones to
perform. The variables are also grouped by scientific goals and priorities, where
again centres may choose which sets to publish, based on interests and
resource constraints. There are also cross-experiment data requests, where
for instance the design may require a variable in one experiment to be
compared against the same variable from a different experiment. The modelling
groups will then need to take this into account before beginning their
simulations. The CMIP6 data request is a codification of the entire
experimental design into a structured set of machine-readable documents,
which can in principle be directly ingested in data workflows.</p>
      <p id="d1e854">The CMIP6 data
request<inline-formula><mml:math id="M5" display="inline"><mml:msup><mml:mi/><mml:mn mathvariant="normal">16</mml:mn></mml:msup></mml:math></inline-formula> <xref ref-type="bibr" rid="bib1.bibx26" id="paren.38"/> combines definitions of
variables and their output format with specifications of the objectives they
support and the experiments that they are required for. The entire request is
encoded in an XML database with rigorous type constraints. Important elements
of the request, such as units, cell methods (expressing the subgrid
processing implicit in the variable definition), sampling frequencies, and
time “slices” (subsets of the entire simulation period as defined in the
experimental design) for required output, are defined using controlled
vocabularies that ensure consistency of interpretation. The request is
designed to enable flexibility, allowing modelling centres to make informed
decisions about the variables they should submit to the CMIP6 archive from
each experiment.</p>
      <p id="d1e869">In order to facilitate the cross linking between the 2100 variables
from the 287 experiments, the request database allows MIPs to
aggregate variables and experiments into groups. This allows MIPs to
designate variable groups by priority and provides for queries that
return the list of variables needed from any given experiment at a
specified time slice and frequency.</p>
      <p id="d1e872">This formulation takes into account the complexities that arise when a
particular MIP requests that variables needed for their own
experiments should also be saved from a DECK experiment or from an
experiment proposed by a different MIP.</p>
      <p id="d1e876">The data request supports a broad range of users who are provided with
a range of different access points. These include the entire
codification in the form of a structured (XML) document, web pages, or
spreadsheets, as well as a Python API and command-line tools, to
satisfy a wide variety of usage patterns for accessing the data
request information.</p>
      <p id="d1e879">The data request's machine-readable database has been an extraordinary
resource for the modelling centres. They can, for example, directly
integrate the request specifications with their workflows to ensure
that the correct set of variables is saved for each experiment they
plan to run. In addition, it has given them a new-found ability to
estimate the data volume associated with meeting a MIP's requirements,
a feature exploited below in Sect. <xref ref-type="sec" rid="Ch1.S3.SS4"/>.</p>
</sec>
<sec id="Ch1.S3.SS2">
  <title>Model inputs</title>
      <p id="d1e890">Datasets used by the model for the configuration of model inputs
<xref ref-type="bibr" rid="bib1.bibx12" id="paren.39"><named-content content-type="pre">Input Datasets for Model Intercomparison Projects
(input4MIPs), see</named-content></xref>
as well as observations for the
comparison with models <xref ref-type="bibr" rid="bib1.bibx40 bib1.bibx16" id="paren.40"><named-content content-type="pre">Observations for Model Intercomparison
Projects (obs4MIPs), see</named-content></xref> are
both now organized in the same way, and share many of the naming and metadata
conventions as the CMIP model output itself. The coherence of standards
across model inputs, outputs, and observational datasets is a development
that will enable the community to build a rich toolset across all of these
datasets. The datasets follow the versioning methodologies described in
Sect. <xref ref-type="sec" rid="Ch1.S7"/>.</p>
</sec>
<sec id="Ch1.S3.SS3">
  <title>Data reference syntax</title>
      <p id="d1e912">The organization of the model output follows the data reference syntax
(DRS)<fn id="Ch1.Footn17"><p id="d1e915"><uri>https://docs.google.com/document/d/1h0r8RZr_f3-8egBMMh7aqLwy3snpD6_MrDz1q8n5XUk/edit?usp=sharing</uri>
(last access: 17 August 2018)</p></fn> first used in CMIP5, and now in a somewhat
modified form in CMIP6. The DRS depends on predefined controlled
vocabularies
CMIP6_CVs<fn id="Ch1.Footn18"><p id="d1e921"><uri>https://github.com/WCRP-CMIP/CMIP6_CVs</uri> <?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> for various terms including the names of
institutions, models, experiments, time frequencies, etc. The CVs are now
recorded as a version-controlled set of structured text documents, and
satisfy the requirement that there is a single authoritative source for any
CV<inline-formula><mml:math id="M6" display="inline"><mml:msup><mml:mi/><mml:mn mathvariant="normal">18</mml:mn></mml:msup></mml:math></inline-formula>, on which all elements in the toolchain will rely. The DRS
elements that rely on these controlled vocabularies appear as netCDF
attributes and are used in constructing file names, directory names, and
unique identifiers of datasets that are essential throughout the CMIP6
infrastructure. These aspects are covered in detail in the CMIP6 Global
Attributes, DRS, Filenames, Directory Structure, and
CVs<fn id="Ch1.Footn19"><p id="d1e938"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_global_attributes_filenames_CVs_v6.2.6.pdf</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> position paper. A new element in the DRS
indicates whether data have been stored on a native grid or have been
regridded (see discussion below in Sect. <xref ref-type="sec" rid="Ch1.S3.SS4"/> on the potentially
critical role of regridded output). This element of the DRS will allow us to
track the usage of the regridded subset of data and assess the
relative popularity of native-grid vs. standard-grid output.</p>
</sec>
<?pagebreak page3666?><sec id="Ch1.S3.SS4">
  <title>CMIP6 data volumes</title>
      <p id="d1e953">As noted, extrapolations based on CMIP3 and CMIP5 lead to some alarming
trends in data volume <xref ref-type="bibr" rid="bib1.bibx35" id="paren.41"><named-content content-type="pre">e.g.</named-content></xref>. As seen in
Fig. 2 in <xref ref-type="bibr" rid="bib1.bibx35" id="text.42"/>, model output such as those from the various CMIP phases (1
through 6) are beginning to rival the observational data volume. As noted in the
introduction, a particular problem for our community is the diverse and very
large user base for the data, many of whom are not climate specialists, but
downstream users of climate data studying the impacts of climate change. This
stands in contrast to other fields with comparably large data holdings: data
from the Large Hadron Collider <xref ref-type="bibr" rid="bib1.bibx1" id="paren.43"><named-content content-type="pre">e.g.</named-content></xref>, for example,
are primarily consumed by high-energy physicists and are not of direct interest to
anyone else.</p>
      <p id="d1e969">A rigorous approach is needed to estimate future data volumes, rather
than relying on simple extrapolation. Contributions to the increase in
data volume include the systematic increase in model resolution and
complexity of the experimental protocol and data request. We consider
these separately:</p>
<sec id="Ch1.S3.SS4.SSS1">
  <title>Resolution</title>
      <p id="d1e977">The median horizontal resolution of a CMIP model
tends to grow with time, and is typically expected to be 100 km
in CMIP6, compared with 200 km in CMIP5. Generally the temporal
resolution of the model (although not the data) is doubled as well,
for reasons of numerical stability. Thus, for an <inline-formula><mml:math id="M7" display="inline"><mml:mi>N</mml:mi></mml:math></inline-formula>-fold increase
in horizontal resolution, we require an <inline-formula><mml:math id="M8" display="inline"><mml:mrow><mml:msup><mml:mi>N</mml:mi><mml:mn mathvariant="normal">3</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula> increase in
computational capacity. The vertical resolution grows in a more
controlled fashion, at least as far as the data are concerned, as
often the requested output is reported on a standard set of
atmospheric levels that has not changed much over the years.
Similarly the temporal resolution of the data request does not
increase at the same rate as the model time step: monthly averages
remain monthly averages. Thus, the <inline-formula><mml:math id="M9" display="inline"><mml:mrow><mml:msup><mml:mi>N</mml:mi><mml:mn mathvariant="normal">3</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula> increase in computational
capacity will result in an <inline-formula><mml:math id="M10" display="inline"><mml:mrow><mml:msup><mml:mi>N</mml:mi><mml:mn mathvariant="normal">2</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula> increase in data volume,
ceteris paribus. Thus, data volume (<inline-formula><mml:math id="M11" display="inline"><mml:mi>V</mml:mi></mml:math></inline-formula>) and computational
capacity (<inline-formula><mml:math id="M12" display="inline"><mml:mi>C</mml:mi></mml:math></inline-formula>) are related as <inline-formula><mml:math id="M13" display="inline"><mml:mrow><mml:mi>V</mml:mi><mml:mo>∼</mml:mo><mml:msup><mml:mi>C</mml:mi><mml:mrow><mml:mn mathvariant="normal">2</mml:mn><mml:mo>/</mml:mo><mml:mn mathvariant="normal">3</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, purely from the
point of view of resolution. Consequently, if centres then
experience an 8-fold increase in <inline-formula><mml:math id="M14" display="inline"><mml:mi>C</mml:mi></mml:math></inline-formula> between CMIPs, we can expect a
doubling of model resolution and an approximate quadrupling of the
data volume (see discussion in the CMIP6 Output Grid Guidance
document<fn id="Ch1.Footn20"><p id="d1e1062"><uri>https://docs.google.com/document/d/1kZw3KXvhRAJdBrXHhXo4f6PDl_NzrFre1UfWGHISPz4/edit?ts=5995cbff</uri>
(last access: 17 August 2018)</p></fn>).<?xmltex \hack{\newpage}?></p>
      <p id="d1e1069">A similar approximate doubling of model resolution occurred between
CMIP3 and CMIP5, but data volume increased 50-fold. What caused that
extraordinary increase?</p>
</sec>
<sec id="Ch1.S3.SS4.SSS2">
  <title>Complexity</title>
      <p id="d1e1078">The answer lies in the complexity of CMIP: the
complexity of the data request and of the experimental protocol. The
first component, the data request complexity, is related to that of
the science: the number of processes being studied, and the physical
variables required for the study, along with the large number of
satellite MIPs (23) that now comprise the CMIP6 project. In CPMIP
<xref ref-type="bibr" rid="bib1.bibx5" id="paren.44"/>, we have attempted a rigorous definition
of this complexity, measured by the number of physical variables
simulated by the model. This, we argue, grows not smoothly like
resolution, but in very distinct generational step transitions, such
as the one from atmosphere–ocean models to Earth system models,
which, as shown in <xref ref-type="bibr" rid="bib1.bibx5" id="text.45"/>, involved a substantial
jump in complexity with regard to the number of physical, chemical,
and biological species being modelled. Many models of the CMIP5 era
added atmospheric chemistry and aerosol–cloud feedbacks, sometimes
with <inline-formula><mml:math id="M15" display="inline"><mml:mrow><mml:mi mathvariant="script">O</mml:mi><mml:mo>(</mml:mo><mml:mn mathvariant="normal">100</mml:mn><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula> species. CMIP5 also marked the first time in
CMIP that ESMs were used to simulate changes in the carbon cycle.</p>
      <p id="d1e1101">The second component of complexity is the experimental protocol, and
the number of experiments themselves when comparing successive
phases of CMIP. The number of experiments (and years simulated) grew
from 12 in CMIP3 to about 50 in CMIP5, greatly inflating the data
produced. With the new structure of CMIP6, with a DECK and 23
endorsed MIPs, the number of experiments has grown tremendously
(from about 50 to 287). We propose as a measure of experimental
complexity, “the total number of simulated years (SYs)” called
for by the experimental protocol. Note that modelling centres must
make trade-offs between experimental complexity and resolution in
deciding their level of participation in CMIP6, as discussed in
<xref ref-type="bibr" rid="bib1.bibx5" id="text.46"/>.</p>
      <p id="d1e1107">Two further steps have been proposed toward ensuring sustainable growth in
data volumes. The first of these is the consideration of standard horizontal
resolutions for saving data, as is already done for vertical and temporal
resolution in the data request. Cross-model analyses already cast all data to
a common grid in order to evaluate it as an ensemble, typically at fairly low
resolution. The studies of Knutti and colleagues (e.g.
<xref ref-type="bibr" rid="bib1.bibx28" id="altparen.47"/>), for example, are typically performed on
relatively coarse grids. Accordingly for most purposes atmospheric data on
the ERA-40 grid (<inline-formula><mml:math id="M16" display="inline"><mml:mrow><mml:mn mathvariant="normal">2</mml:mn><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup><mml:mo>×</mml:mo><mml:mn mathvariant="normal">2.5</mml:mn><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup></mml:mrow></mml:math></inline-formula>) would suffice, with obvious
exceptions for experiments like those called for by HighResMIP
<xref ref-type="bibr" rid="bib1.bibx22" id="paren.48"/>. A similar conclusion applies for ocean data (the
World Ocean Atlas <inline-formula><mml:math id="M17" display="inline"><mml:mrow><mml:mn mathvariant="normal">1</mml:mn><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup><mml:mo>×</mml:mo><mml:mn mathvariant="normal">1</mml:mn><mml:msup><mml:mi/><mml:mo>∘</mml:mo></mml:msup></mml:mrow></mml:math></inline-formula> grid), with extended
discussion of the benefits and losses due to regridding
<xref ref-type="bibr" rid="bib1.bibx19 bib1.bibx20" id="paren.49"><named-content content-type="pre">see</named-content></xref>.</p>
      <p id="d1e1161">This has not been mandated for CMIP6 for a number of reasons. Firstly,
regridding is burdensome on many grounds:<?pagebreak page3667?> it requires considerable
expertise to choose appropriate algorithms for particular variables,
for instance, we may need algorithms that guarantee the exact conservation for
scalars, or the preservation of streamlines for vector fields may be a
requirement; and it can be expensive in terms of computation and
storage. Secondly, regridding is irreversible (which amounts to
“lossy” data reduction) and non-commutative with certain basic
arithmetic operations such as multiplication (i.e. the product of
regridded variables does not in general equal the regridded output of
the product computed on the native grid). This can be problematic for
budget studies. However, the same issues would apply for
time-averaging and other operations long used in the field: much
analysis of CMIP output is performed on monthly averaged data, which
is “lossy” compression along the time axis relative to the model's
time resolution.</p>
      <p id="d1e1165">These issues have contributed to a lack of consensus in moving forward, and
the recommendations on regridding remain in flux. The CMIP6 Output Grid
Guidance document
<fn id="Ch1.Footn21"><p id="d1e1168"><uri>https://docs.google.com/document/d/1kZw3KXvhRAJdBrXHhXo4f6PDl_NzrFre1UfWGHISPz4/edit?ts=5995cbff</uri>
(last access: 17 August 2018)</p></fn> outlines a number of possible recommendations,
including the provision of “weights” to a target grid. Many of the
considerations around regridding, particularly for ocean data in
CMIP6, are discussed at length in <xref ref-type="bibr" rid="bib1.bibx20" id="text.50"/>.</p>
      <p id="d1e1177">There is a similar lack of consensus around whether or not to adopt a common
calendar for particular experiments. In cases such as a long-running
control simulation where all years are equivalent and of no historical
significance, it is customary in this community to use simplified calendars
– such as a Julian, a “no-leap” (365-day), or an “equal-month” (360-day)
calendar – rather than the Gregorian. However, comparison across datasets
using different calendars can be a frustrating burden on the end-user. Nevertheless, there
is no consensus at this point to impose a particular calendar.</p>
      <p id="d1e1180">As outlined below in Sect. <xref ref-type="sec" rid="Ch1.S6"/>, both ESGF data nodes and the
creators of secondary repositories are given considerable leeway in choosing
data subsets for replication, based on their own interests. The tracking
mechanisms outlined in Sect. <xref ref-type="sec" rid="Ch1.S5.SS2"/> below will allow us to
ascertain, after the fact, how widely used the native grid data may be
vis-à-vis the regridded subset, and allow us to recalibrate the
replicas, as usage data becomes available. We also note that the providers of
at least one of the standard metrics packages
<xref ref-type="bibr" rid="bib1.bibx14" id="paren.51"><named-content content-type="pre">ESMValTool,</named-content></xref> have expressed a preference for
standard grid data for their analysis, as regridding from disparate grids
increases the complexity of their already overburdened infrastructure.</p>
      <p id="d1e1192">A second method of data reduction for the purposes of storage and
transmission is data compression. The netCDF4 software,
which is used in writing CMIP6 data, includes an option for lossless
compression or deflation <xref ref-type="bibr" rid="bib1.bibx44" id="paren.52"/> that relies on the
same technique used in standard tools such as gzip. In
practice, the reduction in data volume will depend upon the
“entropy” or randomness in the data, with smoother data or fields
with many missing data points (e.g. land or ocean) being compressed
more.</p>
      <p id="d1e1198">Dealing with compressed data entails computational costs, not only
during its creation, but also every time the data are reinflated.
There is also a subtle interplay with precision: for instance
temperatures usually seen in climate models appear to deflate better
when expressed in Kelvin, rather than Celsius, but that is due to the
fact that the leading order bits are always the same; thus, the
data is actually less precise. Deflation is also enhanced by
reorganizing (“shuffling”) the data internally into chunks that have
spatial and temporal coherence.</p>
      <p id="d1e1201">Some argue for the use of more aggressive lossy compression methods
<xref ref-type="bibr" rid="bib1.bibx3" id="paren.53"/>, but for CMIP6 it can be argued that the resulting
loss of precision and the consequences for scientific results require
considerably more evaluation by the community before such methods can be
accepted. However, as noted above, some lossy methods of data reduction
(e.g. time-averaging) have long been common practice.</p>
      <p id="d1e1208">To help inform the discussion about compression, we undertook a
systematic study of typical model output files under lossless
compression, the results of which are publicly
available<fn id="Ch1.Footn22"><p id="d1e1211"><uri>https://public.tableau.com/profile/balticbirch#!/vizhome/NC4/NetCDF4Deflation</uri>
(last access: 17 August 2018)</p></fn>. The study indicates that standard zlib
compression in the netCDF4 library with the settings of
<monospace>deflate=2</monospace> (relatively modest, and computationally
inexpensive), and <monospace>shuffle</monospace> (which ensures better
spatiotemporal homogeneity) provides the best compromise between
increased computational cost and reduced data volume. For an ESM, we
expect a total savings of about 50 %, with ocean, ice, and land realms
benefiting most (owing to large areas of the globe that are masked)
and atmospheric data benefiting least. This 50 % estimate has been
verified with sample output from one model whose compression rates
should be quite typical.</p>
      <p id="d1e1223">The DREQ<fn id="Ch1.Footn23"><p id="d1e1226"><uri>https://earthsystemcog.org/projects/wip/CMIP6DataRequest</uri>
(last access: 17 August 2018)</p></fn> alluded to above in Sect. <xref ref-type="sec" rid="Ch1.S3"/>
allows us to estimate expected data volumes. The software generates an
estimate given the model's resolution along with the experiments that will be
performed and the data one intends to save (using DREQ's priority
attribute).</p>
      <p id="d1e1234">For instance, analyses available at the DREQ
site<fn id="Ch1.Footn24"><p id="d1e1237"><uri>http://clipc-services.ceda.ac.uk/dreq/tab01_3_3.html</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> indicate that if a centre were to undertake every
single experiment (all tiers) and save every single variable requested
(all priorities) at a “typical” resolution, it would generate about
800 TB of data, using the guidelines above. Given 100 participating
models, this translates to an upper bound of 80 PB for the entire
CMIP6 archive, although in practice most centres are planning to perform
a modest subset of experiments and save only a subset of variables,
based on their scientific priorities and available computational and
storage resources. The WIP carried out a survey of modelling centres
in 2016, asking them for their expected model resolutions, and
intentions of participating in various experiments. Based on that
survey, we have initially forecast a compressed data volume of 18 PB
for CMIP6. This number, 18 PB, is about 7 times the CMIP5 archive
size. The causes of the dramatic increase in data volume between
CMIP3 and CMIP5 were noted above. There is no comparable jump between
CMIP5 and CMIP6. CMIP6's innovative DECK/endorsed-MIP structure could
be considered successful in that it has limited the rate of growth in
data volume.</p>
      <p id="d1e1244">Prior to CMIP5, similar analyses were undertaken at PCMDI to estimate data
volume and the predicted volume proved reasonably accurate. However, the methods used
for CMIP5 could not be applied to CMIP6 because they depended on
having a much less complex data request. In particular, the cross-MIP data
requests (variables requested by one MIP from another MIP, or the DECK)
require a more sophisticated algorithm. The experience in many modelling
centres at present is that data volume estimates become available only after
the production runs have begun. Reliable estimates ahead of time
based on nothing more than the experimental protocols and model resolutions
are valuable for preparation and planning hardware acquisitions.</p>
      <p id="d1e1247">It should be noted that reporting output on a lower resolution
standard grid (rather than the native model grid) could shrink the
estimated data volume 10-fold, to 1.8 PB. This is an important number,
as will be seen below in Sect. <xref ref-type="sec" rid="Ch1.S6"/>: the managers of
Tier 1 nodes (the largest nodes in the federation) have indicated that
2 PB is about the practical limit for replicated storage of data from
all CMIP6 models. This target could be achieved by requiring compression and the use of
reduced-resolution standard grids, but modelling centres are free to choose
whether or not to compress and regrid.</p>
</sec>
</sec>
</sec>
<?pagebreak page3668?><sec id="Ch1.S4">
  <title>Licensing</title>
      <p id="d1e1260">The licensing policy established for CMIP6 is based on an examination
of data usage patterns in CMIP5. First, while in CMIP5 the licensing
policy called for registration and acceptance of the terms of use, a
large fraction, perhaps a majority of users, actually obtained their
data not directly from ESGF, but from third-party copies, such as the
“snapshots” alluded to in Item 7,
Sect. <xref ref-type="sec" rid="Ch1.S2"/>. Those users accessing the data
indirectly, as shown in Fig. <xref ref-type="fig" rid="Ch1.F2"/>, relied on user groups
or their home institutions to make secondary repositories that could
be more conveniently accessed. The WIP CMIP6 Licensing and Access
Control<fn id="Ch1.Footn25"><p id="d1e1267"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_Licensing_and_Access_Control.pdf</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> position paper refers to the secondary
repositories as “dark” and those obtaining CMIP data from those
repositories as “dark users” who are invisible to the ESGF system.
While this appears to subvert the licensing and registration policy
put in place for CMIP5, this should not be seen as a “bootleg”
process: it is in fact the most efficient use of limited network
bandwidth and storage at the user sites. In CMIP6 we expect similar
data archive snapshots to host data and offload some of the network
provisioning requirements from the ESGF nodes.</p>
      <p id="d1e1274">At the same time we wish to retain the ability for users of these
“dark” repositories to benefit from the augmented provenance
services provided by infrastructure advances, where a user can inform
themselves or be notified of data retractions or replacements when
contributed datasets are found to be erroneous and replaced (see
Sect. <xref ref-type="sec" rid="Ch1.S5"/> and <xref ref-type="sec" rid="Ch1.S5.SS4"/>).</p>

      <?xmltex \floatpos{t}?><fig id="Ch1.F2" specific-use="star"><caption><p id="d1e1283">Typical data access pattern in CMIP5 involved users making
local copies, and user groups making institutional-scale caches
from ESGF. Figure courtesy of Stephan Kindermann, DKRZ, adapted from
WIP Licensing White Paper.</p></caption>
        <?xmltex \igopts{width=369.885827pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/3659/2018/gmd-11-3659-2018-f02.png"/>

      </fig>

      <p id="d1e1292">The proposed licensing policy removes the impossible task of license
enforcement from the distribution system, and embraces the “dark”
repositories and users. To quote the WIP position paper:</p>
      <p id="d1e1296"><disp-quote>
  <p id="d1e1299">The proposal is that (1) a data license be embedded in the data
files, making it impossible for users to avoid having a copy of the
license, and (2) the onus on defending the provisions of the license
be on the original modeling center …</p>
</disp-quote></p>
      <p id="d1e1303">Licenses will be embedded in all CMIP6 files, and all repositories, whether
sanctioned or “dark”, can be data sources, as seen below in the discussion
of replication (Sect. <xref ref-type="sec" rid="Ch1.S6"/>). In the embedded license approach,
modelling centres are offered two choices of Creative Commons
licenses: data covered by the Creative
Commons Attribution “ShareAlike” 4.0 International
License<fn id="Ch1.Footn26"><p id="d1e1308"><uri>http://creativecommons.org/licenses/by-sa/4.0/</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>
will be freely available; for centres with more
restrictive policies, the Creative
Commons Attribution “NonCommercial-ShareAlike” 4.0 International
License<fn id="Ch1.Footn27"><p id="d1e1315"><uri>http://creativecommons.org/licenses/by-nc-sa/4.0/</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> will limit use to non-commercial purposes.
Further sharing of the data is allowed, as the license travels with
the data. The PCMDI website provides a link to the current
CMIP6 Terms of Use webpage<fn id="Ch1.Footn28"><p id="d1e1322"><uri>https://pcmdi.llnl.gov/CMIP6/TermsOfUse</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>.</p>
</sec>
<sec id="Ch1.S5">
  <title>Citation, provenance, quality assurance, and documentation</title>
      <?pagebreak page3669?><p id="d1e1336">As noted in Sect. <xref ref-type="sec" rid="Ch1.S2"/>, citation requirements flow
from two underlying considerations: one, to provide proper credit and
formal acknowledgment of the authors of datasets; and the other, to
enable rigorous tracking of data provenance and data usage. The
tracking facilitates scientific reproducibility and traceability, as
well as enabling statistical analyses of dataset utility.<?xmltex \hack{\newpage}?></p>
      <p id="d1e1342">In addition to clearly identifying what data have been used in
research studies and who deserves credit for providing that data, it
is essential that the data be examined for quality and that
documentation be made available describing the model and experiment
conditions under which it was generated. These subjects are addressed
in the four position papers summarized in this section.</p>
      <p id="d1e1345">The principles outlined above are well-aligned with the
Joint Declaration of Data Citation
Principles<fn id="Ch1.Footn29"><p id="d1e1348"><uri>https://www.force11.org/group/joint-declaration-data-citation-principles-final</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> formulated by the Force11 (The Future of
Research Communications and e-Scholarship) consortium, which has acknowledged
the rapid evolution of digital scholarship and archiving, as well as the need
to update the rules of scholarly publication for the digital age. We are
convinced that not only peer-reviewed publications but also the data itself
should now be considered a first-class product of the research enterprise.
This means that data requires curation and should be treated with the same
care as journal articles. Moreover, most journals and academies now insist
that data used in the literature be made publicly available for independent
inquiry and reproduction of results. New services like
Scholix<fn id="Ch1.Footn30"><p id="d1e1355"><uri>http://www.scholix.org</uri> (last access: 17 August 2018)</p></fn> are evolving to support the exchange and access of
such data–data and data–literature interlinking.</p>
      <p id="d1e1361">Given the complexity of the CMIP6 data request, we expect a total
dataset count of <inline-formula><mml:math id="M18" display="inline"><mml:mrow><mml:mi mathvariant="script">O</mml:mi><mml:mo>(</mml:mo><mml:msup><mml:mn mathvariant="normal">10</mml:mn><mml:mn mathvariant="normal">6</mml:mn></mml:msup><mml:mo>)</mml:mo></mml:mrow></mml:math></inline-formula>. Because dozens of datasets are
typically used in a single scientific study, it is impractical to cite
each dataset individually in the same way as individual research
publications are acknowledged. Based on this consideration, there
needs to be a mechanism to cite data and give credit to data providers
that relies on a rather coarse granularity, while at the same time
offering another option at a much finer granularity for recording the
specific files and datasets used in a study.</p>
      <p id="d1e1382">In the following, two distinct types of persistent identifiers (PIDs) are
discussed: DOIs, which can only be assigned to data that comply with certain
standards for citation metadata and curation, and the more generic
“Handles”<fn id="Ch1.Footn31"><p id="d1e1385"><uri>https://www.dona.net/handle-system</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> that have fewer constraints and may be more easily
adapted for a particular use. The “Handle System”, as explained in
Sect. <xref ref-type="sec" rid="Ch1.S5.SS2"/>, allows unique PIDs to be assigned to datasets at the
point of publication. Technically, both types of PIDs rely on the underlying
Global Handle Registry to provide services (e.g. to resolve the PIDs and
provide associated metadata, such as the location of the data itself).</p>
<sec id="Ch1.S5.SS1">
  <title>Persistent identifiers for acknowledgment and citation</title>
      <p id="d1e1399">Based on earlier phases of CMIP, some datasets contributed to the
CMIP6 archive will be flawed (due, for example, to errors in
processing); therefore, they will not accurately represent a model's
behaviour. When errors are uncovered in the datasets, they may be
replaced with corrected versions. Similarly, additional datasets may
be added to an initially incomplete collection of datasets. Thus,
initially at least, the DOIs assigned for the purposes of citation and
acknowledgement will represent an evolving underlying collection of
datasets.</p>
      <p id="d1e1402">The recommendations, detailed in the
CMIP6 Data Citation and Long Term
Archival<fn id="Ch1.Footn32"><p id="d1e1405"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_Data_Citation_LTA.pdf</uri>
(last access: 17 August 2018)</p></fn> position paper, recognize two phases to the
process of assigning DOIs to collections of datasets: an initial
phase, when the data have been released and preliminary community
analysis is underway, and a second stage when most errors in the data
have been identified and corrected. Upon reaching stage two, the data
will be transferred to the long-term archive (LTA) of the IPCC Data
Distribution Centre (IPCC DDC) and deemed appropriate for
interdisciplinary use (e.g. in policy studies).</p>
      <p id="d1e1411">For evolving dataset aggregations, the data citation infrastructure relies on
information collected from the data providers and uses the
DataCite<fn id="Ch1.Footn33"><p id="d1e1414"><uri>https://www.datacite.org/dois.html</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> data infrastructure to assign DOIs and record
associated metadata. DataCite is a leading global non-profit organization
that provides persistent identifiers (DOIs) for research data. The DOIs will
be assigned to the following:
<list list-type="order"><list-item>
      <p id="d1e1423">aggregations that include all the datasets contributed by one
model from one institution from all of a single MIP's experiments,
and</p></list-item><list-item>
      <p id="d1e1427">smaller-size aggregations that include all datasets contributed
by one model from one institution generated in performing one
experiment (which might include one or more simulations).</p></list-item></list></p>
      <p id="d1e1430">These aggregations are dynamic as far as the PID infrastructure is
concerned: new elements can be added to the aggregation without
modifying the PID. As an example, for the coarser of the two
aggregations defined above, the same PID will apply to an evolving
number of simulations as new experiments are performed with the model.
This PID architecture is shown in Fig. <xref ref-type="fig" rid="Ch1.F3"/>. Since
these collections are dynamic, citation requires authors to provide a
version reference.</p>

      <?xmltex \floatpos{t}?><fig id="Ch1.F3" specific-use="star"><caption><p id="d1e1438">Schematic PID architecture, showing layers in the PID
hierarchy. In the lower layers of the hierarchy, PIDs are static
once generated, and new datasets generate new versions with new
PIDs. Each file carries a PID and each collection (dataset,
simulation, and so on) is related to a PID. Resolving the PID in the
Handle server guides the user to the file or the landing page
describing the collection. Each box in the figure will be
uniquely addressed by its PID.</p></caption>
          <?xmltex \igopts{width=369.885827pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/3659/2018/gmd-11-3659-2018-f03.png"/>

        </fig>

      <p id="d1e1447">As an initial dataset matures and becomes stable, it is assigned a new
DOI. Before this is done, to meet formal requirements, the data
citation infrastructure requires some additional steps. First, we
ensure that there has been sufficient community examination of the
data (through citations in published literature, for instance) to
qualify it as having been peer-reviewed. Second, further steps are
undertaken to assure important information exists in ancillary
metadata repositories, including, for example, documentation (ES-DOC,
errata and citation) and to provide quality assurance of data and
metadata consistency and completeness (see Sect. <xref ref-type="sec" rid="Ch1.S5.SS3"/>). Once
these criteria have been satisfied, a DOI will be issued by the IPCC
DDC hosted by DKRZ. These dataset collections will meet the stringent
metadata and documentation requirements of the IPCC DDC. Since these
collections are static, no version reference is required in a
citation. Should errors be subsequently found, they will be corrected
in the data and published under a new DOI. The original DOI and its
related data are still available but are labelled as superseded with a
link recorded pointing to the corrected data.</p>
      <p id="d1e1452">For CMIP6, the initially assigned DOIs (associated with evolving
collections of data) must be used in research papers to properly give
credit to each of the modelling groups providing the data. Once a
stable collection of datasets has met the higher standards for
long-term curation and quality, the DOI assigned by the IPCC DDC
should be used instead. The data citation approach is described in
greater detail in <xref ref-type="bibr" rid="bib1.bibx39" id="text.54"/>.</p>
</sec>
<?pagebreak page3670?><sec id="Ch1.S5.SS2">
  <title>Persistent identifiers for tracking, provenance, and
curation</title>
      <p id="d1e1464">Although the DOIs assigned to relatively large aggregations of
datasets are well suited for citation and acknowledgment purposes,
they are not issued at fine enough granularity to meet the scientific
imperative that published results should be traceable and verifiable.
Furthermore, management of the CMIP6 archive requires that PIDs be
assigned at a much finer granularity than the DOIs. For these
purposes, PIDs recognized by the Global Handle Registry will be
assigned at two different levels of granularity: one per file and one
per dataset.</p>
      <p id="d1e1467">A unique Handle will be generated each time a new CMIP6 data file is created,
and the Handle will be recorded in the file's metadata (in the form of a
netCDF global attribute named <monospace>tracking_id</monospace>). At the time the data is
published, the <monospace>tracking_id</monospace> will be processed by the CMIP6 Handle
service infrastructure and recorded in the ESGF metadata catalog. Another
Handle will subsequently be assigned at a somewhat coarser granularity to each
aggregation of files containing the data from a single variable sampled at a
single frequency from a single model running a single experiment. In ESGF
terminology, this collection of files is referred to as an “atomic
dataset”.</p>
      <p id="d1e1476">As described in the CMIP6 Persistent Identifiers Implementation
Plan<fn id="Ch1.Footn34"><p id="d1e1479"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_PID_Implementation_Plan.pdf</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> position paper, a Handle assigned at either of
these two levels of the PID hierarchy identifies a static entity; if
any file associated with a Handle is altered in any way a new Handle
must be created. The PID infrastructure is also central to the
replication and versioning strategies, as described in
Sects. <xref ref-type="sec" rid="Ch1.S6"/> and <xref ref-type="sec" rid="Ch1.S7"/> below.
Furthermore, as a means of recording provenance and enabling tracking
of dataset usage, authors are urged to include a PID list (a flat
list of all PIDs referenced) attached to each CMIP6-based publication as supplementary
material.</p>

      <?xmltex \floatpos{t}?><fig id="Ch1.F4" specific-use="star"><caption><p id="d1e1492">PID workflow, showing the generation and registry of PIDs,
with checkpoints where compliance is assured.</p></caption>
          <?xmltex \igopts{width=369.885827pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/3659/2018/gmd-11-3659-2018-f04.png"/>

        </fig>

      <p id="d1e1502">The implementation plan describes methods for generating and
registering Handles using an asynchronous messaging system known as
RabbitMQ. This system, designed in collaboration with ESGF developers
and shown in Fig. <xref ref-type="fig" rid="Ch1.F4"/>, guarantees, for example, that
PIDs are correctly generated in accordance with the versioning
guidelines. The CMIP6 Handle system builds on the idea of tracking-ids
used in CMIP5, but with a more rigorous quality control to ensure that
new PIDs are generated when data are modified. The dataset and file
Handles are also associated with basic metadata, called PID kernel
information <xref ref-type="bibr" rid="bib1.bibx43" id="paren.55"/>, which facilitate the recording
of basic provenance information. Datasets and files point to each
other to bind the granularities together. In addition, dataset kernel
information refers to previous and later versions, errata information,
and replicas, as explained in more detail in the position paper.</p>
</sec>
<sec id="Ch1.S5.SS3">
  <title>Quality assurance</title>
      <?pagebreak page3671?><p id="d1e1516">Quality assurance (QA) encompasses the entire data life cycle, as
depicted in Fig. <xref ref-type="fig" rid="Ch1.F5"/>. At all stages, a goal is to capture
provenance information that will enable scientific reproducibility.
Further, as noted in Item 2 in Sect. 2.2,
the QA procedures should uncover issues that might undermine trust in
the data by those outside the Earth system modelling community if
errors were left unreported.</p>

      <?xmltex \floatpos{t}?><fig id="Ch1.F5" specific-use="star"><caption><p id="d1e1523">Schematic of the phases of quality assurance, displaying earlier
stages in the hands of modelling centres (left), and more formal
long-term data curation stages (right). Quality assurance is
applied both to the data (D, above) as well as the metadata (M)
describing the data. Figure drawn from the WIP's Quality Assurance
position paper.</p></caption>
          <?xmltex \igopts{width=369.885827pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/3659/2018/gmd-11-3659-2018-f05.png"/>

        </fig>

      <p id="d1e1532">QA must ensure that the data and metadata correctly reflect a model's
simulation, so that it can be reliably used for scientific purposes. As
depicted in Fig. <xref ref-type="fig" rid="Ch1.F5"/>, the first stage of QA is the responsibility
of the data producer: in fact the cycle of model development and diagnosis is
the most critical element of QA. The second aspect is ensuring that
disseminated data include common metadata based on common CVs, which will
enable consistent treatment of data from different groups and institutions.
These requirements are directly embedded in the ESGF publishing process and
in tools such as CMOR<fn id="Ch1.Footn35"><p id="d1e1537"><uri>https://cmor.llnl.gov/</uri> (last access: 17 August 2018)</p></fn> (and its validation component,
PrePARE<fn id="Ch1.Footn36"><p id="d1e1543"><uri>https://cmor.llnl.gov/mydoc_cmip6_validator/</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>). These checks (the D1 and M1 phases of QA in
Fig. <xref ref-type="fig" rid="Ch1.F5"/>) ensure that the data conform to the CMIP6 data request
specifications, conform to all naming conventions and CVs, and follow the
mandated structure for organization into a common directory structure. As
noted in Sect. <xref ref-type="sec" rid="Ch1.S3"/>, many modelling centres have chosen to embed
these steps directly in their workflows to ensure conformance with the CMIP6
requirements as the models are being run and their output processed.</p>
      <p id="d1e1554">At this point, as noted in Fig. <xref ref-type="fig" rid="Ch1.F5"/>, control is ceded to
the ESGF system, where designated QA nodes (ESGF data nodes where
additional services are turned on) perform further QA checks to
certify data is suitable for citation and long-term archiving. A
critical step is the assignment of PIDs (Sect. <xref ref-type="sec" rid="Ch1.S5.SS2"/>, the D2
stage of Fig. <xref ref-type="fig" rid="Ch1.F5"/>), which is more controlled than in
CMIP5 and guarantees that across the data life cycle, the PIDs will be
reliably useful as unique labels of datasets.</p>
      <p id="d1e1564">Beyond this, further stages of QA will be handled within the ESGF
system following procedures outlined in the
CMIP6 Quality
Assurance<fn id="Ch1.Footn37"><p id="d1e1567"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_Quality_Assurance.pdf</uri>
(last access: 17 August 2018)</p></fn> position paper. As previously described, once data
have been published, the data will be scrutinized by researchers in
what can be considered an ongoing period of community-wide scientific
QA of the data. During this period, modelling centres may correct
errors and provide new versions of datasets. In the final stage, the
data pass into the long-term archive (LTA) status, described as the
“bibliometric” phase in Fig. <xref ref-type="fig" rid="Ch1.F5"/>. Just prior to LTA, the
system will verify the minimum standards of provenance documentation. This
is described in the next section.</p>
</sec>
<?pagebreak page3672?><sec id="Ch1.S5.SS4">
  <title>Documentation of provenance</title>
      <p id="d1e1581">As noted earlier in Sect. <xref ref-type="sec" rid="Ch1.S3"/>, for data to become a
first-class scientific resource, the methods of their production must
be documented to the fullest extent possible. For CMIP6, this includes
documenting both the models and the experiments. While traditionally
this is done through peer-reviewed literature, which remains
essential, we note that to facilitate various aspects of search,
discovery, and tracking of datasets, there is an additional need for
structured documentation in machine readable form.</p>

      <?xmltex \floatpos{t}?><fig id="Ch1.F6" specific-use="star"><caption><p id="d1e1588">Elements of ES-DOC documentation. Rows indicate phases of
the modelling process being documented, and box colours indicate the
parties responsible for producing the documentation (see legend).
Figure courtesy of Guillaume Levavasseur, IPSL.</p></caption>
          <?xmltex \igopts{width=341.433071pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/3659/2018/gmd-11-3659-2018-f06.pdf"/>

        </fig>

      <p id="d1e1597">In CMIP6, the documentation of experiments, models, and
simulations is done through the Earth System Documentation
<xref ref-type="bibr" rid="bib1.bibx21" id="paren.56"><named-content content-type="pre">ES-DOC
<fn id="Ch1.Footn38"><p id="d1e1603"><uri>https://www.earthsystemcog.org/projects/es-doc-models/</uri>
(last access: 17 August 2018)</p></fn>,</named-content></xref> project. The
various aspects of model documentation are shown in
Fig. <xref ref-type="fig" rid="Ch1.F6"/>, and in greater detail in the WIP position
paper on
ES-DOC<fn id="Ch1.Footn39"><p id="d1e1613"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_ESDOC_documentation.pdf</uri>
(last access: 17 August 2018)</p></fn>. The CMIP6 experimental design has been
translated into structured text documents, already available from ES-DOC.
ES-DOC has constructed CVs for the description of the CMIP6 standard model
realms (CMIP terminology for climate subsystems, such as “ocean” or
“atmosphere”), including a set of short tables (specializations,
in ES-DOC terminology) for each realm. The specializations are a succinct and
structured description of the model physics. Ideally, modelling groups would
integrate their provision of documentation to ES-DOC with their model development process. This would better ensure the accuracy and
consistency of the documentation. ES-DOC provides a variety of user
interfaces to read and write structured documentation that conforms with the
Common Information Model (CIM) of <xref ref-type="bibr" rid="bib1.bibx30" id="text.57"/>. As models
evolve or differentiate (for example, an Earth system model derived from a
particular physics-only general circulation model), branches and new versions
of the documentation can be produced, and it will be possible to display,
annotate, and add new entries in the genealogy of a model in a manner
familiar to anyone who works with version control software like git.</p>
      <?pagebreak page3673?><p id="d1e1622">A critical element in the ES-DOC process is the documentation of
conformances: steps undertaken by the modelling centres to ensure
that the simulation was conducted as called for by the experiment design. It
is here that the input datasets used in a simulation are documented
<xref ref-type="bibr" rid="bib1.bibx12" id="paren.58"><named-content content-type="pre">e.g. the version of each of the forcing datasets,
see</named-content></xref>. The conformances will be an important element in
guiding the selection of subsets of CMIP6 model results for particular research
studies. A researcher might, for example, choose to sub-select only those
models that used a particular version of the forcing datasets that are
imposed as part of the experimental protocol. The conformances will continue
to grow in importance under the CMIP vision that the DECK will provide an
ongoing foundation on which to build a series of future CMIP phases
<xref ref-type="bibr" rid="bib1.bibx14" id="paren.59"><named-content content-type="pre">shown schematically in Fig. 1 of</named-content></xref>. The
conformances will be essential in enabling studies across model generations.</p>
      <p id="d1e1636">The method of capturing the conformance documentation is a two-stage
process that has been designed to minimize the amount of work required
by a modelling centre. The first stage is to capture the many
conformances common to all simulations. ES-DOC will then automatically
copy these common conformances to multiple simulations, thereby
eliminating duplicated effort. This is followed by a second stage in
which those conformances that are specific to individual experiments
or simulations are collected.</p>
      <p id="d1e1639">While this method of documentation is unfamiliar to many, such methods
are likely to become common and required practice in the maturing
digital age as part of best scientific practices. Documentation of
software validation <xref ref-type="bibr" rid="bib1.bibx36" id="paren.60"><named-content content-type="pre">see e.g.</named-content></xref> and structured
documentation of complete scientific workflows that can be
independently read and processed are both becoming more common
<xref ref-type="bibr" rid="bib1.bibx10" id="paren.61"><named-content content-type="pre">see the special issue on the “Geoscience Paper of the
Future”,</named-content></xref>. We previously noted (see
Item 3 in Sect. 2.2) the special importance of documenting how results
have been obtained and enabling results to be reproduced by others in modern-day climate research.
Rigorous documentation remains a hardy bulwark against challenges to
the scientific process.</p>
      <p id="d1e1652">In keeping with the “dataset-centric rather than system-centric”
approach (Item 7 in Sect. 2.2), a user
will be directly linked to documentation from each dataset. This is
done in CMIP6 by adding a required global attribute
<monospace>further_info_url</monospace> in file headers pointing to the associated
CIM document, which will serve as the landing page for documentation
from which further exploration (by humans or software) will take
place. The form of this URL is standard and can be software-generated:
CMOR, for instance, will automatically add it. The existence and
functioning of the landing page are assured in Stage M3 of
Fig. <xref ref-type="fig" rid="Ch1.F5"/>.</p>
</sec>
</sec>
<sec id="Ch1.S6">
  <title>Replication</title>
      <p id="d1e1667">The replication strategy is covered in the
CMIP6 Replication and
Versioning<fn id="Ch1.Footn40"><p id="d1e1670"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_Replication_and_Versioning.pdf</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> position paper. The recommendations therein are
based on the following primary goal:
<list list-type="bullet"><list-item>
      <?pagebreak page3674?><p id="d1e1679">ensuring at least one copy of a dataset is present at a stable
ESGF node with a mission of long-term maintenance and curation of
data. The total data storage resources planned across the Tier 1
nodes in the CMIP6 era are adequate to support this requirement,
although some data will likely be held on accessible tape storage
rather than spinning disk.</p></list-item></list></p>
      <p id="d1e1682">In addition, we have articulated a number of secondary goals:
<list list-type="bullet"><list-item>
      <p id="d1e1687">enhancing data accessibility across the ESGF (e.g. Australian
data easily accessible to the European continent despite the long
distance);</p></list-item><list-item>
      <p id="d1e1691">enabling each Tier 1 data node to enact specific policies to
support their local objectives;</p></list-item><list-item>
      <p id="d1e1695">ensuring that the most widely requested data is accessible from
multiple ESGF data nodes (of course, any dataset will be available
at least on its original publication data node);</p></list-item><list-item>
      <p id="d1e1699">enabling large-scale data analysis across the federation (see
Item 4 in Sect. <xref ref-type="sec" rid="Ch1.S2"/>);</p></list-item><list-item>
      <p id="d1e1705">ensuring continuity of data access in the event of individual
node failures;</p></list-item><list-item>
      <p id="d1e1709">enabling network load-balancing and enhanced performance;</p></list-item><list-item>
      <p id="d1e1713">reducing the manual workload related to replication;</p></list-item><list-item>
      <p id="d1e1717">and building a reliable replication mechanism that can be used not
only within the federation, but by the secondary repositories
created by user groups (see discussion in
Sect. <xref ref-type="sec" rid="Ch1.S4"/> around Fig. <xref ref-type="fig" rid="Ch1.F2"/>).</p></list-item></list></p>
      <p id="d1e1724">In conjunction with the ESGF and the International Climate Network
Working Group (ICNWG), these recommendations have been translated into
two options for replication.</p>
      <p id="d1e1727">The basic toolchain for replication is built on updated versions of the
software layers used in CMIP5 including the following:
synda<fn id="Ch1.Footn41"><p id="d1e1730"><uri>https://github.com/Prodiguer/synda</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> (formerly synchrodata) and Globus Online
<xref ref-type="bibr" rid="bib1.bibx7" id="paren.62"/>, which are based on underlying data transport
mechanisms such as
gridftp<fn id="Ch1.Footn42"><p id="d1e1740"><uri>http://toolkit.globus.org/toolkit/docs/latest-stable/gridftp/</uri>
(last access: 17 August 2018)</p></fn> and the older and now deprecated protocols like
wget and ftp.</p>
      <p id="d1e1747">As one option, these layers can be used for ad hoc replication by
sites or user groups. For ad hoc replication, there is no obvious
mechanism for triggering updates or replication when new or corrected data
are published (or retracted, see Sect. <xref ref-type="sec" rid="Ch1.S7"/> below). As a
second option, certain designated nodes (replica nodes) will
maintain a protocol for automatic replication, shown in
Fig. <xref ref-type="fig" rid="Ch1.F7"/>.</p>

      <?xmltex \floatpos{t}?><fig id="Ch1.F7" specific-use="star"><caption><p id="d1e1756">CMIP6 replication from data nodes to replica centres and
between replica centres coordinated by a CMIP6 replication team,
under the guidance of the CDNOT.</p></caption>
        <?xmltex \igopts{width=341.433071pt}?><graphic xlink:href="https://gmd.copernicus.org/articles/11/3659/2018/gmd-11-3659-2018-f07.png"/>

      </fig>

      <p id="d1e1765">Given the nature of some of the secondary goals listed above, it would
not be appropriate to prescribe which data should be replicated by
each centre. Rather, the plan should be flexible to accommodate
changing data use profiles and resource availability. A replication
team under the guidance of the CDNOT will coordinate the replication
activities of the CMIP6 data nodes such that the primary goal is
achieved and an effective compromise for the secondary goals is
established.</p>
      <p id="d1e1768">The International Climate Network Working Group (ICNWG), formed under
the Earth System Grid Federation (ESGF), helps set up and optimize
network infrastructures for ESGF climate data sites located around the
world. For example, prioritizing the most widely requested data for
replication can best be done based on operational experience and will
of course change over time. To ensure that the replication strategy is
responding to user need and data node capabilities, the replication
team will maintain and run a set of monitoring and notification tools
assuring that replicas are up to date. The CDNOT is tasked with
ensuring the deployment and smooth functioning of replica nodes.</p>
      <p id="d1e1771">A key issue that emerged from discussions with node managers is that
the replication target has to be of sustainable size. A key finding is
that a replication target about 2 PB in size is the practical
(technical and financial) limit for CMIP6 online (disk) storage at any
single location. Replication beyond this may involve offline storage
(tape) for disaster recovery.</p>
      <p id="d1e1775">Based on experience in CMIP5, it is expected that a number of
“special interest” secondary repositories will hold selected subsets
of CMIP6 data outside of the ESGF federation. This will have the
effect of widening data accessibility geographically, and by user
communities, with obvious benefit to the CMIP6 project. These
secondary repositories will be encouraged and supported where it does
not undermine CMIP6 data management and integrity objectives.</p>
      <p id="d1e1778">In the new dataset-centric approach, licenses and PIDs remain embedded
and will continue to play their roles in the data toolchain even for
these secondary repositories.</p>
      <p id="d1e1781">In CMIP5 a significant issue for users of some third-party archives
was that their replicated data was taken as a one-time snapshot (see
discussion above in Item 7 in Sect. <xref ref-type="sec" rid="Ch1.S2"/>),
and not updated as new versions of the data were submitted to the
source ESGF node. Tools have been developed by a number of
organizations to maintain locally synchronized archives of CMIP5 data
and third-party providers should be encouraged to make use of these
types of tools to keep the local archives up to date.</p>
      <p id="d1e1786">In summary, the requirements for replication are limited to ensuring
<list list-type="bullet"><list-item>
      <p id="d1e1791">that within a reasonably short time period following submission,
there is at least one instance of each submitted dataset stored at a
Tier 1 node (in addition to its primary residence);</p></list-item><list-item>
      <p id="d1e1795">that subsequent versions of submitted datasets are also
replicated by at least one Tier 1 node (see versioning discussion
below in Sect. <xref ref-type="sec" rid="Ch1.S7"/>);</p></list-item><list-item>
      <p id="d1e1801">that creators of secondary repositories take advantage of the
replication toolchain described here, to maintain replicas that can
be kept up to date, and inform local users of dataset retractions
and corrections;</p></list-item><list-item>
      <p id="d1e1805">that the CDNOT is the recognized body to manage the operational
replication strategy for CMIP6.</p></list-item></list></p>
      <p id="d1e1808">We note that the ESGF PID registration service is part of the ESGF
data publication implementation and not exclusive to CMIP6, and is now
in use by the input4MIPs and obs4MIPs projects. The PID registration
service works for all NetCDF-CF files that carry a PID in the
<monospace>tracking_id</monospace> field. This is agreed for all CMIP6 data files.
However, the ESGF PID registration service is not exclusively
applicable to CMIP6 model data files but can also be used for derived
datasets (e.g. subsets or averages) as long as the data are in
NetCDF-CF format with a PID from the Handle service in the
<monospace>tracking_id</monospace>. Once the data are processed by the ESGF PID
registration service, these files may easily be used to
create collections in the PID hierarchy as given in
Fig. <xref ref-type="fig" rid="Ch1.F3"/>. In general all files as digital objects can
be assigned a PID and registered in the CNRI Handle server. Vice
versa, these objects (files) can be uniquely resolved by the Handle
server provided the PID is known. That means the PID service allows
for stable and transparent data access independent of the actual
storage location. The storage location is part of the PID metadata
which are integrated in the Handle server. The PID metadata
generation and registration is part of the ESGF registration service
for NetCDF-CF files but in general the PID architecture is not
restricted to them. It is open for all digital objects.</p>
      <p id="d1e1819">Thus, CMIP6 is the first implementation of the PID service in a larger
data project and ESGF provides, in parallel, the classical data access
via the data reference syntax outlined in the CMIP6 Global Attributes, DRS, Filenames, Directory Structure, and
CVs<fn id="Ch1.Footn43"><p id="d1e1822"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_global_attributes_filenames_CVs_v6.2.6.pdf</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> position paper.</p>
</sec>
<?pagebreak page3675?><sec id="Ch1.S7">
  <title>Versioning</title>
      <p id="d1e1835">The versioning strategy for CMIP6 datasets (see the
CMIP6 Replication and
Versioning<fn id="Ch1.Footn44"><p id="d1e1838"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_Replication_and_Versioning.pdf</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn> position paper) is designed to enable
reproduction of scientific results (Sect. <xref ref-type="sec" rid="Ch1.S2"/>).
Recognizing that errors may be found after datasets have been distributed,
erroneous datasets that may have been used downstream will continue to be
publicly available but marked as superseded. This will allow users to trace
the provenance of published results even if those point to retracted data and
will further allow the possibility of a posteriori correction of
such results.</p>
      <p id="d1e1847">A consistent versioning methodology across all the ESGF data nodes is
required to satisfy these objectives. We note that inconsistent or
informal versioning practices at individual nodes would likely be
invisible to the ESGF infrastructure (e.g. yielding files that look
like replicas, but with inconsistent data and checksums), which would
inhibit traceability across versions.</p>
      <p id="d1e1850">Building on the replication strategy and on input from the ESGF
implementation teams, versioning will leverage the PID infrastructure
of Sect. <xref ref-type="sec" rid="Ch1.S5"/>. PIDs are permanently associated with a
dataset, and new versions will get a new PID. When new versions are
published, there will be two-way links created within the PID kernel
information so that one may query a PID for prior or subsequent
versions.</p>
      <p id="d1e1855">A version number will be assigned to each atomic dataset: a complete
time series of one variable from one experiment and one model. The implication
is that if an error is found in a single variable, other variables produced
from the simulation need not be republished. If an entire experiment is
retracted and republished, all variables will get a consistent version
number. The CDNOT will ensure consistent versioning practices at all
participating data nodes.</p>
<?pagebreak page3676?><sec id="Ch1.S7.SS1">
  <title>Errata</title>
      <p id="d1e1864">In particular, it is worth highlighting the new recommendations
regarding errata. Until CMIP5, we relied on the ESGF system to
push notifications to registered users regarding retractions and
reported errors. This was found to result in imperfect coverage: as
noted in Sect. <xref ref-type="sec" rid="Ch1.S4"/>, a substantial fraction of users
are invisible to the ESGF system. Therefore, following the discussion
in Sect. <xref ref-type="sec" rid="Ch1.S2"/> (see Item 7), we
recommended a design which is dataset-centric rather than
system-centric. Notifications are no longer pushed to users; rather
they will be able to query the status of a dataset they are working
with (e.g. ES-DOC Dataset
Errata search<fn id="Ch1.Footn45"><p id="d1e1871"><uri>https://errata.es-doc.org/static/index.html</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>).
An “errata client” will allow the user to enter a PID to query its
status; and an “errata server” will return the PIDs associated with
prior or posterior versions of that dataset, if any. Details are to be found
in the
Errata<fn id="Ch1.Footn46"><p id="d1e1878"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_Errata_System.pdf</uri>
(last access: 17 August 2018)</p></fn> position paper.</p>
</sec>
</sec>
<sec id="Ch1.S8" sec-type="conclusions">
  <title>The future of the global data infrastructure</title>
      <p id="d1e1891">The WIP was formed in response to the explosive growth of CMIP between
CMIP3 and CMIP5, and it is charged with studying and making
recommendations about the global data infrastructure needed to support
CMIP6 and subsequent similar WCRP activities as they are established
and evolve. Our findings reflect the fact that CMIP is no longer a
cottage industry, and a more formal approach is needed. Several of the
findings have been translated into requirements on the design of the
underlying software infrastructure for data production and
distribution. We have separated infrastructure development into
requirements, implementation, and operations phases, and we have
provided recommendations on the most efficient use of scarce
resources. The resulting recommendations stop well short of any sort
of global governance of this “vast machine”, but address many areas
where, with a relatively light touch, beneficial order, control, and
resource efficiencies result.</p>
      <p id="d1e1894">One key finding that informs everything is that it appears that the
critical importance of such infrastructure is under-appreciated.
Building infrastructure using research funds puts the system in an
untenable position, with a fundamental contradiction at its heart:
infrastructure by its nature should be reliable, robust, based on what
is proven to work, and invisible, whereas scientific research is
hypothesis-driven, risky, and novel, and its results are widely broadcast.
While recommendations have been made at the highest level advocating
remedies <xref ref-type="bibr" rid="bib1.bibx34" id="paren.63"><named-content content-type="pre">e.g.</named-content></xref>, there is little progress to report on
this front. Several of the key pieces of infrastructure
software described here are built and tested by volunteers or
short-term project staff.</p>
      <p id="d1e1902">The central theme of this paper is the inversion of the design of federated
data distribution, to make it dataset-centric rather
than system-centric. We believe that this one aspect of the design
considerably reduces systemic risk, and allows the size of the system
to scale up and down as resource constraints allow. Individual
scientists, institutions, or consortia will be able to pool
resources and share data at will, with relatively light requirements
related to licensing (Sect. <xref ref-type="sec" rid="Ch1.S4"/>) and dataset
tracking (Sect. <xref ref-type="sec" rid="Ch1.S5.SS2"/>). This relieves a considerable design
burden from the ESGF software stack, and further, recognizes that the
data ecosystem extends well beyond the reach of any software system
and that data will be used and reused in a myriad of ways outside
anyone's control.</p>
      <p id="d1e1909">A second key element of the design is the insistence on
machine-readable experimental protocols. Standards, conventions, and
vocabularies are now stored in machine-readable structured text formats like
XML and JSON, thereby enabling software to automate aspects of the process.
This meets an existing urgent need, with some modelling centres already
exploiting this structured information to mitigate the overwhelming
complexity of experimental protocols. Moreover, this will also enable and
encourage unanticipated future use of the information in developing new
software tools for exploiting it as technologies evolve. Our ability to
predict (whether correctly or not remains to be seen) the expected CMIP6 data
volume is one such unexpected outcome.</p>
      <p id="d1e1913">Finally, the infrastructure allows user communities to assess the
costs of participation as well as the benefits. For example, we
believe the new PID-based methods of dataset<?pagebreak page3677?> tracking will allow centres to
measure which data has value downstream. The importance of citations and fair
credit for data providers is recognized with a design that facilitates and
encourages proper citation practices. Tools have been added and made
available that allow centres, and CMIP itself, to estimate the data
requirements of each experimental protocol. Ancillary activities such as
CPMIP add to this an accounting of the computational burden of CMIP6.</p>
      <p id="d1e1916">Certainly not all issues are resolved, and the validation of some of
our findings will have to await the outcome of CMIP6. There is no
community consensus on some proposed design elements, such as standard
grids. Some features long promised, such as server-side analytics
(“bringing analysis to the data”) are yet to become fully mature,
although many exciting efforts are underway, for instance early
investigations into using cloud technologies, both for data storage and
analysis (see discussion above, Item 4 in
Sect. <xref ref-type="sec" rid="Ch1.S2.SS2"/>). The ESGF Compute Working Team is
also working on a set of requirements and “certification”
guidelines<fn id="Ch1.Footn47"><p id="d1e1921"><uri>https://docs.google.com/document/d/1c5KXC0ZfFr1Iko6syhqlS5kWGCnrCqcVsWRU1LHpwG8/edit</uri>
(last access: 17 August 2018)</p></fn> for provisioning computing close to the data.
Nevertheless, the discussion in this article provides a sound basis
for beginning to think about the future.</p>
      <p id="d1e1927">The future brings with it new challenges. First among these is an
expansion of the data ecosystem. There is an increasing blurring of
the boundary between weather and climate as time and space scales
merge <xref ref-type="bibr" rid="bib1.bibx24" id="paren.64"/>. This will increasingly entrain new
communities into climate data ecosystems, each with their own
modelling and analysis practices, standards and conventions, and other
issues. The establishment of the WIP was a crucial step in enhancing
the capabilities, standards, protocols, and policies around the CMIP
enterprise. Earlier discussions on the scope of the WIP also suggested
a broader scope for the panel in the longer term, to coordinate not
only the model intercomparison activities (including for example, the
CORDEX project <xref ref-type="bibr" rid="bib1.bibx29" id="paren.65"/>, which also relies upon ESGF
for data dissemination) but also the climate prediction (seasonal to
decadal) issues and corresponding observational and reanalysis
aspects. We would recommend a closer engagement between these
communities in planning the future of a seamless global data
infrastructure, to better leverage infrastructure investments and
effort.<?xmltex \hack{\newpage}?></p>
      <p id="d1e1937">A further challenge the WIP and the community must grapple with is the
evolution of scientific publication in the digital age, beyond the
peer-reviewed paper. We have noted above that the nature of
publication is changing <xref ref-type="bibr" rid="bib1.bibx10" id="paren.66"><named-content content-type="pre">see e.g.</named-content></xref>. Journals
and academies increasingly insist upon transparency with respect to
codes and data to ensure reproducibility. In the future, datasets and
software with provenance information will be first-class entities of
scientific publication, alongside the traditional peer-reviewed
article. In fact it is likely that those will be increasingly featured
in the grey literature and scientific social media: one can imagine
blog posts and direct annotations on the published literature around
CMIP6 utilizing analyses directly performed on datasets using their PIDs.
Data analytics at the large scale is increasingly moving toward machine
learning and other directly data-driven methods of analysis, which
will also be dependent on data labelled with machine-readable
metadata. Our community needs to pay increasing heed to the status of
their data, metadata, and software in the light of these developments.</p>
      <p id="d1e1945">Future development of the WIP's activities beyond the delivery of
CMIP6 will include an analysis of how the infrastructure design
performed during CMIP6. That analysis, combined with our assessment of
technological change and emerging novel applications, will inform the
future design of infrastructure software, as well as recommendations
to the designers of experiments on how best to fit their protocols
within resource limitations. The vision, as always, is for an open
infrastructure that is reliable and invisible, and allows Earth system
scientists to be nimble in the design of collaborative experiments,
creative in their analysis, and rapid in the delivery of results.</p>
</sec>

      
      </body>
    <back><notes notes-type="codedataavailability">

      <p id="d1e1952">The software and data used for the study of data compression are
available at the deflation study
website<fn id="Ch1.Footn48"><p id="d1e1955"><uri>https://public.tableau.com/profile/balticbirch#!/vizhome/NC4/NetCDF4Deflation</uri>
(last access: 17 August 2018)</p></fn>, courtesy of Garrett Wright.</p>

      <p id="d1e1961">The software and data used for the prediction of data volumes
are available at the dreqDataVol
page<fn id="Ch1.Footn49"><p id="d1e1964"><uri>https://www.earthsystemcog.org/site_media/projects/wip/dreqDataVol.py</uri>
(last access: 17 August 2018)</p></fn>, courtesy of Nalanda Sharadjaya. Much of this
functionality has now been absorbed into DREQ itself.</p>

      <p id="d1e1970">Most of the software referenced here for which the WIP is providing
design guidelines and requirements, but not implementation, including
the ESGF, ES-DOC, and DREQ software stacks, is open source and freely
available. They are autonomous projects and, therefore, not listed here.</p>
  </notes><?xmltex \hack{\clearpage}?><app-group>

<?pagebreak page3678?><app id="App1.Ch1.S1">
  <title>List of WIP position papers</title>
      <p id="d1e1982"><list list-type="bullet">
          <list-item>

      <p id="d1e1987">CDNOT Terms of
Reference<fn id="App1.Ch1.Footn1"><p id="d1e1990"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CDNOT_Terms_of_Reference.pdf</uri>
(last access: 17 August 2018)</p></fn>: a charter for the CMIP6 Data Node
Operations Team. Authorship: WIP.</p>
          </list-item>
          <list-item>

      <p id="d1e1999">CMIP6 Global Attributes, DRS, Filenames, Directory Structure, and
CVs<fn id="App1.Ch1.Footn2"><p id="d1e2002"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_global_attributes_filenames_CVs_v6.2.6.pdf</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>: conventions and controlled vocabularies for
consistent naming of files and variables. Authorship: Karl E. Taylor, Martin Juckes,
Venkatramani Balaji, Luca Cinquini, Sébastien Denvil,
Paul J. Durack, Mark Elkington, Eric Guilyardi, Slava Kharin,
Michael Lautenschlager, Bryan Lawrence, Denis Nadeau, Martina Stockhause, and the WIP.</p>
          </list-item>
          <list-item>

      <p id="d1e2012">CMIP6 Persistent Identifiers Implementation
Plan<fn id="App1.Ch1.Footn3"><p id="d1e2015"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_PID_Implementation_Plan.pdf</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>: a system of identifying and citing datasets
used in studies, at a fine grain. Authorship: Tobias Weigel, Michael Lautenschlager, Martin Juckes and the WIP.</p>
          </list-item>
          <list-item>

      <p id="d1e2025">CMIP6 Replication and
Versioning<fn id="App1.Ch1.Footn4"><p id="d1e2028"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_Replication_and_Versioning.pdf</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>: a system for ensuring reliable and
verifiable replication; tracking of dataset versions, retractions,
and errata. Authorship: Stephan Kindermann, Sébastien Denvil and the
WIP.</p>
          </list-item>
          <list-item>

      <p id="d1e2038">CMIP6 Quality
Assurance<fn id="App1.Ch1.Footn5"><p id="d1e2041"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_Quality_Assurance.pdf</uri>
(last access: 17 August 2018)</p></fn>: systems for ensuring data compliance with
rules and conventions listed above. Authorship: Frank Toussaint,
Martina Stockhause, Michael Lautenschlager and the WIP.</p>
          </list-item>
        </list><?xmltex \hack{\newpage}?><list list-type="bullet">
          <list-item>

      <p id="d1e2053">CMIP6 Data Citation and Long Term
Archival<fn id="App1.Ch1.Footn6"><p id="d1e2056"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_Data_Citation_LTA.pdf</uri>
(last access: 17 August 2018)</p></fn>: a system for generating Digital Object
Identifiers (DOIs) to ensure long-term data curation. Authorship:
Martina Stockhause, Frank Toussaint, Michael Lautenschlager, Bryan Lawrence and the WIP.</p>
          </list-item>
          <list-item>

      <p id="d1e2065">CMIP6 Licensing and Access Control<fn id="App1.Ch1.Footn7"><p id="d1e2068"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_Licensing_and_Access_Control.pdf</uri>
<?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>: terms of use and licences to use data.
Authorship: Bryan Lawrence and the WIP.</p>
          </list-item>
          <list-item>

      <p id="d1e2079">CMIP6 ESGF Publication
Requirements<fn id="App1.Ch1.Footn8"><p id="d1e2082"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_ESGF_Publication_Requirements.pdf</uri><?xmltex \hack{\break}?>(last access: 17 August 2018)</p></fn>: linking WIP specifications to the ESGF
software stack, conventions that software developers can build
against. Authorship: Martin Juckes and the WIP.</p>
          </list-item>
          <list-item>

      <p id="d1e2092">Errata System for
CMIP6<fn id="App1.Ch1.Footn9"><p id="d1e2095"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_Errata_System.pdf</uri>
(last access: 17 August 2018)</p></fn>: a system for tracking and discovery of
reported errata in the CMIP6 system. Authorship: Guillaume Levavasseur, Sébastien Denvil, Atef Ben Nasser, and the WIP.</p>
          </list-item>
          <list-item>

      <p id="d1e2104">ESDOC
Documentation<fn id="App1.Ch1.Footn10"><p id="d1e2107"><uri>https://www.earthsystemcog.org/site_media/projects/wip/CMIP6_ESDOC_documentation.pdf</uri>
(last access: 17 August 2018)</p></fn>: an overview of the process for providing
structured documentation of the models, experiments and simulations
that produce the CMIP6 output datasets. Authorship: the ES-DOC Team.</p>
          </list-item>
        </list><?xmltex \hack{\clearpage}?></p>
</app>
  </app-group><notes notes-type="authorcontribution">

      <p id="d1e2121">All of the authors participated in the development of the paper's findings and
recommendations.</p>
  </notes><notes notes-type="competinginterests">

      <p id="d1e2127">The authors declare that they have no conflict of
interest.</p>
  </notes><ack><title>Acknowledgements</title><p id="d1e2133">We thank Michel Rixen, Stephen Griffies, John Krasting, and three anonymous reviewers for their
close reading and comments on early drafts of this paper.
Colleen McHugh aided with the analysis of data volumes.</p><p id="d1e2135">The research leading to these results received funding from the
European Union Seventh Framework Programme under the IS-ENES2 project
(grant agreement no. 312979).</p><p id="d1e2137">Venkatramani Balaji is supported by the Cooperative Institute for Climate
Science, Princeton University, award NA08OAR4320752 from the
National Oceanic and Atmospheric Administration, U.S. Department of
Commerce. The statements, findings, conclusions, and recommendations
are those of the authors and do not necessarily reflect the views of
Princeton University, the National Oceanic and Atmospheric
Administration, or the U.S. Department of Commerce.</p><p id="d1e2139">Bryan N. Lawrence acknowledges additional support from the UK Natural
Environment Research Council.</p><p id="d1e2141">Karl E. Taylor and Paul J. Durack are supported by the Regional and Global
Model Analysis Program of the United States Department of Energy's
Office of Science, and their work was performed under the auspices
of Lawrence Livermore National Laboratory's contract
DE-AC52-07NA27344.<?xmltex \hack{\newline}?><?xmltex \hack{\newline}?>
Edited by: Steve Easterbrook<?xmltex \hack{\newline}?>
Reviewed by: three anonymous referees</p></ack><ref-list>
    <title>References</title>

      <ref id="bib1.bibx1"><label>Aad et al.(2008)</label><mixed-citation>Aad, G., Butterworth, J., Thion, J., et al.: The ATLAS
experiment at the CERN large hadron collider, Jinst, 3, S08003, <ext-link xlink:href="https://doi.org/10.1088/1748-0221/3/08/S08003" ext-link-type="DOI">10.1088/1748-0221/3/08/S08003</ext-link>, 2008.</mixed-citation></ref>
      <ref id="bib1.bibx2"><label>Andrews et al.(2012)</label><mixed-citation>Andrews, T., Gregory, J. M., Webb, M. J., and Taylor, K. E.: Forcing,
feedbacks and climate sensitivity in CMIP5 coupled atmosphere-ocean climate
models, Geophys. Res. Lett., 39, L09712, <ext-link xlink:href="https://doi.org/10.1029/2012GL051607" ext-link-type="DOI">10.1029/2012GL051607</ext-link>, 2012.</mixed-citation></ref>
      <ref id="bib1.bibx3"><label>Baker et al.(2016)</label><mixed-citation>Baker, A. H., Hammerling, D. M., Mickelson, S. A., Xu, H., Stolpe, M. B.,
Naveau, P., Sanderson, B., Ebert-Uphoff, I., Samarasinghe, S., De Simone, F.,
Carbone, F., Gencarelli, C. N., Dennis, J. M., Kay, J. E., and Lindstrom, P.:
Evaluating lossy data compression on climate simulation data within a large
ensemble, Geosci. Model Dev., 9, 4381–4403,
<ext-link xlink:href="https://doi.org/10.5194/gmd-9-4381-2016" ext-link-type="DOI">10.5194/gmd-9-4381-2016</ext-link>, 2016.</mixed-citation></ref>
      <ref id="bib1.bibx4"><label>Balaji et al.(2011)</label><mixed-citation>
Balaji, V., Ansari, S., and Radhakrishnan, A.: Deploying user-developed
scientific analyses on federated data archives, in: AGU Fall Meeting
Abstracts, vol. 1, p. 1, 2011.</mixed-citation></ref>
      <ref id="bib1.bibx5"><label>Balaji et al.(2017)</label><mixed-citation>Balaji, V., Maisonnave, E., Zadeh, N., Lawrence, B. N., Biercamp, J.,
Fladrich, U., Aloisio, G., Benson, R., Caubel, A., Durachta, J., Foujols,
M.-A., Lister, G., Mocavero, S., Underwood, S., and Wright, G.: CPMIP:
measurements of real computational performance of Earth system models in
CMIP6, Geosci. Model Dev., 10, 19–34,
<ext-link xlink:href="https://doi.org/10.5194/gmd-10-19-2017" ext-link-type="DOI">10.5194/gmd-10-19-2017</ext-link>, 2017.</mixed-citation></ref>
      <ref id="bib1.bibx6"><label>Bony et al.(2013)</label><mixed-citation>
Bony, S., Stevens, B., Held, I. H., Mitchell, J. F., Dufresne, J.-L.,
Emanuel, K. A., Friedlingstein, P., Griffies, S., and Senior, C.: Carbon
dioxide and climate: perspectives on a scientific assessment, in: Climate
Science for Serving Society, Springer, 391–413, 2013.</mixed-citation></ref>
      <ref id="bib1.bibx7"><label>Chard et al.(2015)</label><mixed-citation>Chard, K., Pruyne, J., Blaiszik, B., Ananthakrishnan, R., Tuecke, S., and
Foster, I.: Globus data publication as a service: Lowering barriers to
reproducible science, 2015 IEEE 11th International Conference on e-Science,
401–410, <ext-link xlink:href="https://doi.org/10.1109/eScience.2015.68" ext-link-type="DOI">10.1109/eScience.2015.68</ext-link>, 2015.</mixed-citation></ref>
      <ref id="bib1.bibx8"><label>Charney et al.(1979)</label><mixed-citation>Charney, J. G., Arakawa, A., Baker, D. J., Bolin, B., Dickinson, R. E., Goody,
R. M., Leith, C. E., Stommel, H. M., and Wunsch, C. I.: Carbon dioxide and
climate: a scientific assessment, National Research Council, <ext-link xlink:href="https://doi.org/10.17226/12181" ext-link-type="DOI">10.17226/12181</ext-link>, 1979.</mixed-citation></ref>
      <ref id="bib1.bibx9"><label>Collins and Tabak(2014)</label><mixed-citation>
Collins, F. S. and Tabak, L. A.: NIH plans to enhance reproducibility,
Nature, 505, 612–613, 2014.</mixed-citation></ref>
      <ref id="bib1.bibx10"><label>David et al.(2016)</label><mixed-citation>David, C. H., Gil, Y., Duffy, C. J., Peckham, S. D., and Venayagamoorthy,
S. K.: An introduction to the special issue on Geoscience Papers of the
Future, Earth Space Sci., 3, 441–444, <ext-link xlink:href="https://doi.org/10.1002/2016EA000201" ext-link-type="DOI">10.1002/2016EA000201</ext-link>, 2016.</mixed-citation></ref>
      <ref id="bib1.bibx11"><label>Duffy et al.(2015)</label><mixed-citation>
Duffy, D., Maxwell, T., Doutriaux, C., Williams, D., Chaudhary, A., and Ames,
S.: Integration and Exposure of Large Scale Computational Resources Across
the Earth System Grid Federation (ESGF), in: AGU Fall Meeting Abstracts,
IN31A-1748, 2015.</mixed-citation></ref>
      <ref id="bib1.bibx12"><label>Durack et al.(2018)</label><mixed-citation>Durack, P. J., Taylor, K. E., Eyring, V., Ames, S. K., Hoang, T., Nadeau, D.,
Doutriaux, C., Stockhause, M., and Gleckler, P. J.: Toward Standardized Data
Sets for Climate Model Experimentation, Eos, 99, <ext-link xlink:href="https://doi.org/10.1029/2018EO101751" ext-link-type="DOI">10.1029/2018EO101751</ext-link>,
2018.</mixed-citation></ref>
      <ref id="bib1.bibx13"><label>Edwards(2010)</label><mixed-citation>
Edwards, P.: A vast machine: computer models, climate data, and the politics
of global warming, The MIT Press, 518 pp., ISBN:0262518635, 2010.</mixed-citation></ref>
      <ref id="bib1.bibx14"><label>Eyring et al.(2016a)</label><mixed-citation>Eyring, V., Bony, S., Meehl, G. A., Senior, C. A., Stevens, B., Stouffer, R.
J., and Taylor, K. E.: Overview of the Coupled Model Intercomparison Project
Phase 6 (CMIP6) experimental design and organization, Geosci. Model Dev., 9,
1937–1958, <ext-link xlink:href="https://doi.org/10.5194/gmd-9-1937-2016" ext-link-type="DOI">10.5194/gmd-9-1937-2016</ext-link>, 2016a.</mixed-citation></ref>
      <ref id="bib1.bibx15"><label>Eyring et al.(2016b)</label><mixed-citation>Eyring, V., Gleckler, P. J., Heinze, C., Stouffer, R. J., Taylor, K. E.,
Balaji, V., Guilyardi, E., Joussaume, S., Kindermann, S., Lawrence, B. N.,
Meehl, G. A., Righi, M., and Williams, D. N.: Towards improved and more
routine Earth system model evaluation in CMIP, Earth Syst. Dynam., 7,
813–830, <ext-link xlink:href="https://doi.org/10.5194/esd-7-813-2016" ext-link-type="DOI">10.5194/esd-7-813-2016</ext-link>, 2016b.</mixed-citation></ref>
      <ref id="bib1.bibx16"><label>Ferraro et al.(2015)</label><mixed-citation>Ferraro, R., Waliser, D. E., Gleckler, P., Taylor, K. E., and Eyring, V.:
Evolving Obs4MIPs to Support Phase 6 of the Coupled Model Intercomparison
Project (CMIP6), B. Am. Meteorol. Soc., 96, ES131–ES133,
<ext-link xlink:href="https://doi.org/10.1175/BAMS-D-14-00216.1" ext-link-type="DOI">10.1175/BAMS-D-14-00216.1</ext-link>, 2015.</mixed-citation></ref>
      <ref id="bib1.bibx17"><label>Gates(1992)</label><mixed-citation>
Gates, W. L.: AMIP: The Atmospheric Model Intercomparison Project, B. Am.
Meteorol. Soc., 73, 1962–1970, 1992.</mixed-citation></ref>
      <ref id="bib1.bibx18"><label>Gleckler et al.(2016)</label><mixed-citation>Gleckler, P., Doutriaux, C., Durack, P., Taylor, K., Zhang, Y., Williams, D.,
Mason, E., and Servonnat, J.: A more powerful reality test for climate
models, Eos Trans. AGU, 97, <ext-link xlink:href="https://doi.org/10.1029/2016EO051663" ext-link-type="DOI">10.1029/2016EO051663</ext-link>, 2016.</mixed-citation></ref>
      <ref id="bib1.bibx19"><label>Griffies et al.(2014)</label><mixed-citation>
Griffies, S. M., Adcroft, A. J., Balaji, V., Danabasoglu, G., Durack, P. J.,
Gleckler, P. J., Gregory, J. M., Krasting, J. P., McDougall, T. J., Stouffer,
R. J., Gregory, J., Hallberg, R. W., Legg, S., Martin, T., McDougall, T.,
Pirani, A., Schmidt, G., Stevens, D., Taylor,<?pagebreak page3680?> K. E., and Tsujino, H.:
Sampling the Physical Ocean in CMIP6 Simulations, CLIVAR Report, 2014.</mixed-citation></ref>
      <ref id="bib1.bibx20"><label>Griffies et al.(2016)</label><mixed-citation>Griffies, S. M., Danabasoglu, G., Durack, P. J., Adcroft, A. J., Balaji, V.,
Böning, C. W., Chassignet, E. P., Curchitser, E., Deshayes, J., Drange,
H., Fox-Kemper, B., Gleckler, P. J., Gregory, J. M., Haak, H., Hallberg, R.
W., Heimbach, P., Hewitt, H. T., Holland, D. M., Ilyina, T., Jungclaus, J.
H., Komuro, Y., Krasting, J. P., Large, W. G., Marsland, S. J., Masina, S.,
McDougall, T. J., Nurser, A. J. G., Orr, J. C., Pirani, A., Qiao, F.,
Stouffer, R. J., Taylor, K. E., Treguier, A. M., Tsujino, H., Uotila, P.,
Valdivieso, M., Wang, Q., Winton, M., and Yeager, S. G.: OMIP contribution to
CMIP6: experimental and diagnostic protocol for the physical component of the
Ocean Model Intercomparison Project, Geosci. Model Dev., 9, 3231–3296,
<ext-link xlink:href="https://doi.org/10.5194/gmd-9-3231-2016" ext-link-type="DOI">10.5194/gmd-9-3231-2016</ext-link>, 2016.</mixed-citation></ref>
      <ref id="bib1.bibx21"><label>Guilyardi et al.(2013)</label><mixed-citation>Guilyardi, E., Balaji, V., Lawrence, B., Callaghan, S., Deluca, C., Denvil,
S., Lautenschlager, M., Morgan, M., Murphy, S., and Taylor, K. E.:
Documenting Climate Models and Their Simulations, B. Am. Meteorol. Soc., 94,
623–627, <ext-link xlink:href="https://doi.org/10.1175/BAMS-D-11-00035.1" ext-link-type="DOI">10.1175/BAMS-D-11-00035.1</ext-link>, 2013.</mixed-citation></ref>
      <ref id="bib1.bibx22"><label>Haarsma et al.(2016)</label><mixed-citation>Haarsma, R. J., Roberts, M. J., Vidale, P. L., Senior, C. A., Bellucci, A.,
Bao, Q., Chang, P., Corti, S., Fuckar, N. S., Guemas, V., von Hardenberg, J.,
Hazeleger, W., Kodama, C., Koenigk, T., Leung, L. R., Lu, J., Luo, J.-J.,
Mao, J., Mizielinski, M. S., Mizuta, R., Nobre, P., Satoh, M., Scoccimarro,
E., Semmler, T., Small, J., and von Storch, J.-S.: High Resolution Model
Intercomparison Project (HighResMIP v1.0) for CMIP6, Geosci. Model Dev., 9,
4185–4208, <ext-link xlink:href="https://doi.org/10.5194/gmd-9-4185-2016" ext-link-type="DOI">10.5194/gmd-9-4185-2016</ext-link>, 2016.</mixed-citation></ref>
      <ref id="bib1.bibx23"><label>Hansen et al.(1981)</label><mixed-citation>Hansen, J., Johnson, D., Lacis, A., Lebedeff, S., Lee, P., Rind, D., and
Russell, G.: Climate Impact of Increasing Atmospheric Carbon Dioxide,
Science, 213, 957–966, <ext-link xlink:href="https://doi.org/10.1126/science.213.4511.957" ext-link-type="DOI">10.1126/science.213.4511.957</ext-link>, 1981.</mixed-citation></ref>
      <ref id="bib1.bibx24"><label>Hoskins(2013)</label><mixed-citation>
Hoskins, B.: The potential for skill across the range of the seamless
weather-climate prediction problem: a stimulus for our science, Q. J. Roy.
Meteor. Soc., 139, 573–584, 2013.</mixed-citation></ref>
      <ref id="bib1.bibx25"><label>Houghton et al.(1992)</label><mixed-citation>
Houghton, J. T., Callander, B. A., and Varney, S. K.: Climate change 1992,
Cambridge University Press, 1992.</mixed-citation></ref>
      <ref id="bib1.bibx26"><label>Juckes et al.(2015)</label><mixed-citation>
Juckes, M., Eyring, V., Taylor, K., Balaji, V., and Stouffer, R.: The CMIP6
Data Request: the next generation climate archive, in: EGU General Assembly
Conference Abstracts, 17, 13112, 2015.</mixed-citation></ref>
      <ref id="bib1.bibx27"><label>Knutti(2010)</label><mixed-citation>
Knutti, R.: The end of model democracy?, Clim. Change, 102, 395–404, 2010.</mixed-citation></ref>
      <ref id="bib1.bibx28"><label>Knutti et al.(2017)</label><mixed-citation>
Knutti, R., Sedláček, J., Sanderson, B. M., Lorenz, R., Fischer,
E. M., and Eyring, V.: A climate model projection weighting scheme accounting
for performance and interdependence, Geophys. Res. Lett., 44, 1909–1918,
2017.</mixed-citation></ref>
      <ref id="bib1.bibx29"><label>Lake et al.(2017)</label><mixed-citation>
Lake, I., Gutowski, W., Giorgi, F., and Lee, B.: CORDEX: Climate Research and
Information for Regions, B. Am. Meteorol. Soc., 98, ES189–ES192, 2017.</mixed-citation></ref>
      <ref id="bib1.bibx30"><label>Lawrence et al.(2012)</label><mixed-citation>Lawrence, B. N., Balaji, V., Bentley, P., Callaghan, S., DeLuca, C., Denvil,
S., Devine, G., Elkington, M., Ford, R. W., Guilyardi, E., Lautenschlager,
M., Morgan, M., Moine, M.-P., Murphy, S., Pascoe, C., Ramthun, H., Slavin,
P., Steenman-Clark, L., Toussaint, F., Treshansky, A., and Valcke, S.:
Describing Earth system simulations with the Metafor CIM, Geosci. Model Dev.,
5, 1493–1500, <ext-link xlink:href="https://doi.org/10.5194/gmd-5-1493-2012" ext-link-type="DOI">10.5194/gmd-5-1493-2012</ext-link>, 2012.</mixed-citation></ref><?xmltex \hack{\newpage}?>
      <ref id="bib1.bibx31"><label>Lawrence et al.(2013)</label><mixed-citation>
Lawrence, B. N., Bennett, V. L., Churchill, J., Juckes, M., Kershaw, P.,
Pascoe, S., Pepler, S., Pritchard, M., and Stephens, A.: Storing and
manipulating environmental big data with JASMIN, 2013 IEEE International
Conference on Big Data, 68–75, 2013.</mixed-citation></ref>
      <ref id="bib1.bibx32"><label>Manabe and Wetherald(1975)</label><mixed-citation>Manabe, S. and Wetherald, R. T.: The Effects of Doubling the CO<inline-formula><mml:math id="M19" display="inline"><mml:msub><mml:mi/><mml:mn mathvariant="normal">2</mml:mn></mml:msub></mml:math></inline-formula>
Concentration on the climate of a General Circulation Model, J. Atmos. Sci.,
32, 3–15, 1975.</mixed-citation></ref>
      <ref id="bib1.bibx33"><label>Moss et al.(2010)</label><mixed-citation>
Moss, R. H., Edmonds, J. A., Hibbard, K. A., Manning, M. R., Rose, S. K., Van
Vuuren, D. P., Carter, T. R., Emori, S., Kainuma, M., Kram, T., Meehl, G. A.,
Mitchell, J. F. B., Nakicenovic, N., Riahi, K., Smith, S. J., Stouffer, R.
J., Thomson, A. M., Weyant, J. P., and Wilbanks, T. J.: The next generation
of scenarios for climate change research and assessment, Nature, 463,
747–756, 2010.</mixed-citation></ref>
      <ref id="bib1.bibx34"><label>NASEM(2012)</label><mixed-citation>NASEM: A National Strategy for Advancing Climate Modeling, The National
Academies Press, Washington, DC, <ext-link xlink:href="https://doi.org/10.17226/13430" ext-link-type="DOI">10.17226/13430</ext-link>, 2012.</mixed-citation></ref>
      <ref id="bib1.bibx35"><label>Overpeck et al.(2011)</label><mixed-citation>Overpeck, J., Meehl, G., Bony, S., and Easterling, D.: Climate data
challenges in the 21st century, Science, 331, 700–702,
<ext-link xlink:href="https://doi.org/10.1126/science.1197869" ext-link-type="DOI">10.1126/science.1197869</ext-link>, 2011.</mixed-citation></ref>
      <ref id="bib1.bibx36"><label>Peng(2011)</label><mixed-citation>Peng, R. D.: Reproducible Research in Computational Science, Science, 334,
1226–1227, <ext-link xlink:href="https://doi.org/10.1126/science.1213847" ext-link-type="DOI">10.1126/science.1213847</ext-link>, 2011.</mixed-citation></ref>
      <ref id="bib1.bibx37"><label>Schnase et al.(2017)</label><mixed-citation>
Schnase, J. L., Duffy, D. Q., Tamkin, G. S., Nadeau, D., Thompson, J. H.,
Grieg, C. M., McInerney, M. A., and Webster, W. P.: MERRA analytic services:
Meeting the big data challenges of climate science through cloud-enabled
climate analytics-as-a-service, Comput. Environ. Urban, 61, 198–211, 2017.</mixed-citation></ref>
      <ref id="bib1.bibx38"><label>Stocker et al.(2013)</label><mixed-citation>
Stocker, T. F., Qin, D., Plattner, G.-K., Tignor, M., Allen, S. K., Boschung,
J., Nauels, A., Xia, Y., Bex, V., and Midgley, P. M. (Eds.): Climate Change
2013: The Physical Science Basis, Contribution of Working Group I to the
Fifth Assessment Report of the Intergovernmental Panel on Climate Change,
Cambridge University Press, Cambridge, UK, New York, NY, USA, 2013.</mixed-citation></ref>
      <ref id="bib1.bibx39"><label>Stockhause and Lautenschlager(2017)</label><mixed-citation>Stockhause, M. and
Lautenschlager, M.: CMIP6 Data Citation of Evolving Data, Data Science
Journal, 16, 1–13, <ext-link xlink:href="https://doi.org/10.5334/dsj-2017-030" ext-link-type="DOI">10.5334/dsj-2017-030</ext-link>, 2017.</mixed-citation></ref>
      <ref id="bib1.bibx40"><label>Teixeira et al.(2014)</label><mixed-citation>
Teixeira, J., Waliser, D., Ferraro, R., Gleckler, P., Lee, T., and Potter,
G.: Satellite observations for CMIP5: The genesis of Obs4MIPs, B. Am.
Meteorol. Soc., 95, 1329–1334, 2014.</mixed-citation></ref>
      <ref id="bib1.bibx41"><label>Williams et al.(2011)</label><mixed-citation>
Williams, D. N., Taylor, K. E., Cinquini, L., Evans, B., Kawamiya, M.,
Lautenschlager, M., Lawrence, B., Middleton, D., and ESGF Contributors: The
Earth System Grid Federation: Software framework supporting CMIP5 data
analysis and dissemination, CLIVAR Exchanges, 56, 40–42, 2011.</mixed-citation></ref>
      <ref id="bib1.bibx42"><label>Williams et al.(2015)</label><mixed-citation>Williams, D. N., Balaji, V., Cinquini, L., Denvil, S., Duffy, D., Evans, B.,
Ferraro, R., Hansen, R., Lautenschlager, M., and Trenham, C.: A Global
Repository for Planet-Sized Experiments and Observations, B. Am. Meteorol.
Soc., 97, 803–816, <ext-link xlink:href="https://doi.org/10.1175/BAMS-D-15-00132.1" ext-link-type="DOI">10.1175/BAMS-D-15-00132.1</ext-link>, 2015.</mixed-citation></ref>
      <ref id="bib1.bibx43"><label>Zhou et al.(2018)</label><mixed-citation>
Zhou, G., Weigel, T., and Plale, B.: Persistent Identifier Kernel Information
for Machine Discovery, in: Joint Conference on Digital Libraries, 2018.</mixed-citation></ref>
      <ref id="bib1.bibx44"><label>Ziv and Lempel(1977)</label><mixed-citation>
Ziv, J. and Lempel, A.: A universal algorithm for sequential data
compression, IEEE T. Inform. Theory, 23, 337–343, 1977.</mixed-citation></ref>

  </ref-list></back>
    <!--<article-title-html>Requirements for a global data infrastructure in support of CMIP6</article-title-html>
<abstract-html><p>The World Climate Research Programme (WCRP)'s Working Group on Climate
Modelling (WGCM) Infrastructure Panel (WIP) was formed in 2014 in response to
the explosive growth in size and complexity of Coupled Model Intercomparison
Projects (CMIPs) between CMIP3 (2005–2006) and CMIP5 (2011–2012). This
article presents the WIP recommendations for the global data infrastructure
needed to support CMIP design, future growth, and evolution. Developed in
close coordination with those who build and run the existing infrastructure
(the Earth System Grid Federation; ESGF), the recommendations are based on
several principles beginning with the need to separate requirements,
implementation, and operations. Other important principles include the
consideration of the diversity of community needs around data – a
data ecosystem – the importance of provenance, the need for
automation, and the obligation to measure costs and benefits.</p><p>This paper concentrates on requirements, recognizing the diversity
of communities involved (modelers, analysts, software developers,
and downstream users). Such requirements include the need for
scientific reproducibility and accountability alongside the need to
record and track data usage. One key element is to generate a
dataset-centric rather than system-centric focus, with an aim to
making the infrastructure less prone to systemic failure.</p><p>With these overarching principles and requirements, the WIP has
produced a set of position papers, which are summarized in the
latter pages of this document. They provide specifications for
managing and delivering model output, including strategies for
replication and versioning, licensing, data quality assurance,
citation, long-term archiving, and dataset tracking. They also
describe a new and more formal approach for specifying what data,
and associated metadata, should be saved, which enables future data
volumes to be estimated, particularly for well-defined projects such
as CMIP6.</p><p>The paper concludes with a future facing consideration of the global
data infrastructure evolution that follows from the blurring of
boundaries between climate and weather, and the changing nature of
published scientific results in the digital age.</p></abstract-html>
<ref-html id="bib1.bib1"><label>Aad et al.(2008)</label><mixed-citation>
Aad, G., Butterworth, J., Thion, J., et al.: The ATLAS
experiment at the CERN large hadron collider, J. Instrum., 3, S08003, <a href="https://doi.org/10.1088/1748-0221/3/08/S08003" target="_blank">https://doi.org/10.1088/1748-0221/3/08/S08003</a>, 2008.
</mixed-citation></ref-html>
<ref-html id="bib1.bib2"><label>Andrews et al.(2012)</label><mixed-citation>
Andrews, T., Gregory, J. M., Webb, M. J., and Taylor, K. E.: Forcing,
feedbacks and climate sensitivity in CMIP5 coupled atmosphere-ocean climate
models, Geophys. Res. Lett., 39, L09712, <a href="https://doi.org/10.1029/2012GL051607" target="_blank">https://doi.org/10.1029/2012GL051607</a>, 2012.
</mixed-citation></ref-html>
<ref-html id="bib1.bib3"><label>Baker et al.(2016)</label><mixed-citation>
Baker, A. H., Hammerling, D. M., Mickelson, S. A., Xu, H., Stolpe, M. B.,
Naveau, P., Sanderson, B., Ebert-Uphoff, I., Samarasinghe, S., De Simone, F.,
Carbone, F., Gencarelli, C. N., Dennis, J. M., Kay, J. E., and Lindstrom, P.:
Evaluating lossy data compression on climate simulation data within a large
ensemble, Geosci. Model Dev., 9, 4381–4403,
<a href="https://doi.org/10.5194/gmd-9-4381-2016" target="_blank">https://doi.org/10.5194/gmd-9-4381-2016</a>, 2016.
</mixed-citation></ref-html>
<ref-html id="bib1.bib4"><label>Balaji et al.(2011)</label><mixed-citation>
Balaji, V., Ansari, S., and Radhakrishnan, A.: Deploying user-developed
scientific analyses on federated data archives, in: AGU Fall Meeting
Abstracts, vol. 1, p. 1, 2011.
</mixed-citation></ref-html>
<ref-html id="bib1.bib5"><label>Balaji et al.(2017)</label><mixed-citation>
Balaji, V., Maisonnave, E., Zadeh, N., Lawrence, B. N., Biercamp, J.,
Fladrich, U., Aloisio, G., Benson, R., Caubel, A., Durachta, J., Foujols,
M.-A., Lister, G., Mocavero, S., Underwood, S., and Wright, G.: CPMIP:
measurements of real computational performance of Earth system models in
CMIP6, Geosci. Model Dev., 10, 19–34,
<a href="https://doi.org/10.5194/gmd-10-19-2017" target="_blank">https://doi.org/10.5194/gmd-10-19-2017</a>, 2017.
</mixed-citation></ref-html>
<ref-html id="bib1.bib6"><label>Bony et al.(2013)</label><mixed-citation>
Bony, S., Stevens, B., Held, I. H., Mitchell, J. F., Dufresne, J.-L.,
Emanuel, K. A., Friedlingstein, P., Griffies, S., and Senior, C.: Carbon
dioxide and climate: perspectives on a scientific assessment, in: Climate
Science for Serving Society, Springer, 391–413, 2013.
</mixed-citation></ref-html>
<ref-html id="bib1.bib7"><label>Chard et al.(2015)</label><mixed-citation>
Chard, K., Pruyne, J., Blaiszik, B., Ananthakrishnan, R., Tuecke, S., and
Foster, I.: Globus data publication as a service: Lowering barriers to
reproducible science, 2015 IEEE 11th International Conference on e-Science,
401–410, <a href="https://doi.org/10.1109/eScience.2015.68" target="_blank">https://doi.org/10.1109/eScience.2015.68</a>, 2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib8"><label>Charney et al.(1979)</label><mixed-citation>
Charney, J. G., Arakawa, A., Baker, D. J., Bolin, B., Dickinson, R. E., Goody,
R. M., Leith, C. E., Stommel, H. M., and Wunsch, C. I.: Carbon dioxide and
climate: a scientific assessment, National Research Council, <a href="https://doi.org/10.17226/12181" target="_blank">https://doi.org/10.17226/12181</a>, 1979.
</mixed-citation></ref-html>
<ref-html id="bib1.bib9"><label>Collins and Tabak(2014)</label><mixed-citation>
Collins, F. S. and Tabak, L. A.: NIH plans to enhance reproducibility,
Nature, 505, 612–613, 2014.
</mixed-citation></ref-html>
<ref-html id="bib1.bib10"><label>David et al.(2016)</label><mixed-citation>
David, C. H., Gil, Y., Duffy, C. J., Peckham, S. D., and Venayagamoorthy,
S. K.: An introduction to the special issue on Geoscience Papers of the
Future, Earth Space Sci., 3, 441–444, <a href="https://doi.org/10.1002/2016EA000201" target="_blank">https://doi.org/10.1002/2016EA000201</a>, 2016.
</mixed-citation></ref-html>
<ref-html id="bib1.bib11"><label>Duffy et al.(2015)</label><mixed-citation>
Duffy, D., Maxwell, T., Doutriaux, C., Williams, D., Chaudhary, A., and Ames,
S.: Integration and Exposure of Large Scale Computational Resources Across
the Earth System Grid Federation (ESGF), in: AGU Fall Meeting Abstracts,
IN31A-1748, 2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib12"><label>Durack et al.(2018)</label><mixed-citation>
Durack, P. J., Taylor, K. E., Eyring, V., Ames, S. K., Hoang, T., Nadeau, D.,
Doutriaux, C., Stockhause, M., and Gleckler, P. J.: Toward Standardized Data
Sets for Climate Model Experimentation, Eos, 99, <a href="https://doi.org/10.1029/2018EO101751" target="_blank">https://doi.org/10.1029/2018EO101751</a>,
2018.
</mixed-citation></ref-html>
<ref-html id="bib1.bib13"><label>Edwards(2010)</label><mixed-citation>
Edwards, P.: A vast machine: computer models, climate data, and the politics
of global warming, The MIT Press, 518 pp., ISBN:0262518635, 2010.
</mixed-citation></ref-html>
<ref-html id="bib1.bib14"><label>Eyring et al.(2016a)</label><mixed-citation>
Eyring, V., Bony, S., Meehl, G. A., Senior, C. A., Stevens, B., Stouffer, R.
J., and Taylor, K. E.: Overview of the Coupled Model Intercomparison Project
Phase 6 (CMIP6) experimental design and organization, Geosci. Model Dev., 9,
1937–1958, <a href="https://doi.org/10.5194/gmd-9-1937-2016" target="_blank">https://doi.org/10.5194/gmd-9-1937-2016</a>, 2016a.
</mixed-citation></ref-html>
<ref-html id="bib1.bib15"><label>Eyring et al.(2016b)</label><mixed-citation>
Eyring, V., Gleckler, P. J., Heinze, C., Stouffer, R. J., Taylor, K. E.,
Balaji, V., Guilyardi, E., Joussaume, S., Kindermann, S., Lawrence, B. N.,
Meehl, G. A., Righi, M., and Williams, D. N.: Towards improved and more
routine Earth system model evaluation in CMIP, Earth Syst. Dynam., 7,
813–830, <a href="https://doi.org/10.5194/esd-7-813-2016" target="_blank">https://doi.org/10.5194/esd-7-813-2016</a>, 2016b.
</mixed-citation></ref-html>
<ref-html id="bib1.bib16"><label>Ferraro et al.(2015)</label><mixed-citation>
Ferraro, R., Waliser, D. E., Gleckler, P., Taylor, K. E., and Eyring, V.:
Evolving Obs4MIPs to Support Phase 6 of the Coupled Model Intercomparison
Project (CMIP6), B. Am. Meteorol. Soc., 96, ES131–ES133,
<a href="https://doi.org/10.1175/BAMS-D-14-00216.1" target="_blank">https://doi.org/10.1175/BAMS-D-14-00216.1</a>, 2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib17"><label>Gates(1992)</label><mixed-citation>
Gates, W. L.: AMIP: The Atmospheric Model Intercomparison Project, B. Am.
Meteorol. Soc., 73, 1962–1970, 1992.
</mixed-citation></ref-html>
<ref-html id="bib1.bib18"><label>Gleckler et al.(2016)</label><mixed-citation>
Gleckler, P., Doutriaux, C., Durack, P., Taylor, K., Zhang, Y., Williams, D.,
Mason, E., and Servonnat, J.: A more powerful reality test for climate
models, Eos Trans. AGU, 97, <a href="https://doi.org/10.1029/2016EO051663" target="_blank">https://doi.org/10.1029/2016EO051663</a>, 2016.
</mixed-citation></ref-html>
<ref-html id="bib1.bib19"><label>Griffies et al.(2014)</label><mixed-citation>
Griffies, S. M., Adcroft, A. J., Balaji, V., Danabasoglu, G., Durack, P. J.,
Gleckler, P. J., Gregory, J. M., Krasting, J. P., McDougall, T. J., Stouffer,
R. J., Gregory, J., Hallberg, R. W., Legg, S., Martin, T., McDougall, T.,
Pirani, A., Schmidt, G., Stevens, D., Taylor, K. E., and Tsujino, H.:
Sampling the Physical Ocean in CMIP6 Simulations, CLIVAR Report, 2014.
</mixed-citation></ref-html>
<ref-html id="bib1.bib20"><label>Griffies et al.(2016)</label><mixed-citation>
Griffies, S. M., Danabasoglu, G., Durack, P. J., Adcroft, A. J., Balaji, V.,
Böning, C. W., Chassignet, E. P., Curchitser, E., Deshayes, J., Drange,
H., Fox-Kemper, B., Gleckler, P. J., Gregory, J. M., Haak, H., Hallberg, R.
W., Heimbach, P., Hewitt, H. T., Holland, D. M., Ilyina, T., Jungclaus, J.
H., Komuro, Y., Krasting, J. P., Large, W. G., Marsland, S. J., Masina, S.,
McDougall, T. J., Nurser, A. J. G., Orr, J. C., Pirani, A., Qiao, F.,
Stouffer, R. J., Taylor, K. E., Treguier, A. M., Tsujino, H., Uotila, P.,
Valdivieso, M., Wang, Q., Winton, M., and Yeager, S. G.: OMIP contribution to
CMIP6: experimental and diagnostic protocol for the physical component of the
Ocean Model Intercomparison Project, Geosci. Model Dev., 9, 3231–3296,
<a href="https://doi.org/10.5194/gmd-9-3231-2016" target="_blank">https://doi.org/10.5194/gmd-9-3231-2016</a>, 2016.
</mixed-citation></ref-html>
<ref-html id="bib1.bib21"><label>Guilyardi et al.(2013)</label><mixed-citation>
Guilyardi, E., Balaji, V., Lawrence, B., Callaghan, S., Deluca, C., Denvil,
S., Lautenschlager, M., Morgan, M., Murphy, S., and Taylor, K. E.:
Documenting Climate Models and Their Simulations, B. Am. Meteorol. Soc., 94,
623–627, <a href="https://doi.org/10.1175/BAMS-D-11-00035.1" target="_blank">https://doi.org/10.1175/BAMS-D-11-00035.1</a>, 2013.
</mixed-citation></ref-html>
<ref-html id="bib1.bib22"><label>Haarsma et al.(2016)</label><mixed-citation>
Haarsma, R. J., Roberts, M. J., Vidale, P. L., Senior, C. A., Bellucci, A.,
Bao, Q., Chang, P., Corti, S., Fuckar, N. S., Guemas, V., von Hardenberg, J.,
Hazeleger, W., Kodama, C., Koenigk, T., Leung, L. R., Lu, J., Luo, J.-J.,
Mao, J., Mizielinski, M. S., Mizuta, R., Nobre, P., Satoh, M., Scoccimarro,
E., Semmler, T., Small, J., and von Storch, J.-S.: High Resolution Model
Intercomparison Project (HighResMIP v1.0) for CMIP6, Geosci. Model Dev., 9,
4185–4208, <a href="https://doi.org/10.5194/gmd-9-4185-2016" target="_blank">https://doi.org/10.5194/gmd-9-4185-2016</a>, 2016.
</mixed-citation></ref-html>
<ref-html id="bib1.bib23"><label>Hansen et al.(1981)</label><mixed-citation>
Hansen, J., Johnson, D., Lacis, A., Lebedeff, S., Lee, P., Rind, D., and
Russell, G.: Climate Impact of Increasing Atmospheric Carbon Dioxide,
Science, 213, 957–966, <a href="https://doi.org/10.1126/science.213.4511.957" target="_blank">https://doi.org/10.1126/science.213.4511.957</a>, 1981.
</mixed-citation></ref-html>
<ref-html id="bib1.bib24"><label>Hoskins(2013)</label><mixed-citation>
Hoskins, B.: The potential for skill across the range of the seamless
weather-climate prediction problem: a stimulus for our science, Q. J. Roy.
Meteor. Soc., 139, 573–584, 2013.
</mixed-citation></ref-html>
<ref-html id="bib1.bib25"><label>Houghton et al.(1992)</label><mixed-citation>
Houghton, J. T., Callander, B. A., and Varney, S. K.: Climate change 1992,
Cambridge University Press, 1992.
</mixed-citation></ref-html>
<ref-html id="bib1.bib26"><label>Juckes et al.(2015)</label><mixed-citation>
Juckes, M., Eyring, V., Taylor, K., Balaji, V., and Stouffer, R.: The CMIP6
Data Request: the next generation climate archive, in: EGU General Assembly
Conference Abstracts, 17, 13112, 2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib27"><label>Knutti(2010)</label><mixed-citation>
Knutti, R.: The end of model democracy?, Clim. Change, 102, 395–404, 2010.
</mixed-citation></ref-html>
<ref-html id="bib1.bib28"><label>Knutti et al.(2017)</label><mixed-citation>
Knutti, R., Sedláček, J., Sanderson, B. M., Lorenz, R., Fischer,
E. M., and Eyring, V.: A climate model projection weighting scheme accounting
for performance and interdependence, Geophys. Res. Lett., 44, 1909–1918,
2017.
</mixed-citation></ref-html>
<ref-html id="bib1.bib29"><label>Lake et al.(2017)</label><mixed-citation>
Lake, I., Gutowski, W., Giorgi, F., and Lee, B.: CORDEX: Climate Research and
Information for Regions, B. Am. Meteorol. Soc., 98, ES189–ES192, 2017.
</mixed-citation></ref-html>
<ref-html id="bib1.bib30"><label>Lawrence et al.(2012)</label><mixed-citation>
Lawrence, B. N., Balaji, V., Bentley, P., Callaghan, S., DeLuca, C., Denvil,
S., Devine, G., Elkington, M., Ford, R. W., Guilyardi, E., Lautenschlager,
M., Morgan, M., Moine, M.-P., Murphy, S., Pascoe, C., Ramthun, H., Slavin,
P., Steenman-Clark, L., Toussaint, F., Treshansky, A., and Valcke, S.:
Describing Earth system simulations with the Metafor CIM, Geosci. Model Dev.,
5, 1493–1500, <a href="https://doi.org/10.5194/gmd-5-1493-2012" target="_blank">https://doi.org/10.5194/gmd-5-1493-2012</a>, 2012.
</mixed-citation></ref-html>
<ref-html id="bib1.bib31"><label>Lawrence et al.(2013)</label><mixed-citation>
Lawrence, B. N., Bennett, V. L., Churchill, J., Juckes, M., Kershaw, P.,
Pascoe, S., Pepler, S., Pritchard, M., and Stephens, A.: Storing and
manipulating environmental big data with JASMIN, 2013 IEEE International
Conference on Big Data, 68–75, 2013.
</mixed-citation></ref-html>
<ref-html id="bib1.bib32"><label>Manabe and Wetherald(1975)</label><mixed-citation>
Manabe, S. and Wetherald, R. T.: The Effects of Doubling the CO<sub>2</sub>
Concentration on the climate of a General Circulation Model, J. Atmos. Sci.,
32, 3–15, 1975.
</mixed-citation></ref-html>
<ref-html id="bib1.bib33"><label>Moss et al.(2010)</label><mixed-citation>
Moss, R. H., Edmonds, J. A., Hibbard, K. A., Manning, M. R., Rose, S. K., Van
Vuuren, D. P., Carter, T. R., Emori, S., Kainuma, M., Kram, T., Meehl, G. A.,
Mitchell, J. F. B., Nakicenovic, N., Riahi, K., Smith, S. J., Stouffer, R.
J., Thomson, A. M., Weyant, J. P., and Wilbanks, T. J.: The next generation
of scenarios for climate change research and assessment, Nature, 463,
747–756, 2010.
</mixed-citation></ref-html>
<ref-html id="bib1.bib34"><label>NASEM(2012)</label><mixed-citation>
NASEM: A National Strategy for Advancing Climate Modeling, The National
Academies Press, Washington, DC, <a href="https://doi.org/10.17226/13430" target="_blank">https://doi.org/10.17226/13430</a>, 2012.
</mixed-citation></ref-html>
<ref-html id="bib1.bib35"><label>Overpeck et al.(2011)</label><mixed-citation>
Overpeck, J., Meehl, G., Bony, S., and Easterling, D.: Climate data
challenges in the 21st century, Science, 331, 700–702,
<a href="https://doi.org/10.1126/science.1197869" target="_blank">https://doi.org/10.1126/science.1197869</a>, 2011.
</mixed-citation></ref-html>
<ref-html id="bib1.bib36"><label>Peng(2011)</label><mixed-citation>
Peng, R. D.: Reproducible Research in Computational Science, Science, 334,
1226–1227, <a href="https://doi.org/10.1126/science.1213847" target="_blank">https://doi.org/10.1126/science.1213847</a>, 2011.
</mixed-citation></ref-html>
<ref-html id="bib1.bib37"><label>Schnase et al.(2017)</label><mixed-citation>
Schnase, J. L., Duffy, D. Q., Tamkin, G. S., Nadeau, D., Thompson, J. H.,
Grieg, C. M., McInerney, M. A., and Webster, W. P.: MERRA analytic services:
Meeting the big data challenges of climate science through cloud-enabled
climate analytics-as-a-service, Comput. Environ. Urban, 61, 198–211, 2017.
</mixed-citation></ref-html>
<ref-html id="bib1.bib38"><label>Stocker et al.(2013)</label><mixed-citation>
Stocker, T. F., Qin, D., Plattner, G.-K., Tignor, M., Allen, S. K., Boschung,
J., Nauels, A., Xia, Y., Bex, V., and Midgley, P. M. (Eds.): Climate Change
2013: The Physical Science Basis, Contribution of Working Group I to the
Fifth Assessment Report of the Intergovernmental Panel on Climate Change,
Cambridge University Press, Cambridge, UK, New York, NY, USA, 2013.
</mixed-citation></ref-html>
<ref-html id="bib1.bib39"><label>Stockhause and
Lautenschlager(2017)</label><mixed-citation> Stockhause, M. and
Lautenschlager, M.: CMIP6 Data Citation of Evolving Data, Data Science
Journal, 16, 1–13, <a href="https://doi.org/10.5334/dsj-2017-030" target="_blank">https://doi.org/10.5334/dsj-2017-030</a>, 2017.
</mixed-citation></ref-html>
<ref-html id="bib1.bib40"><label>Teixeira et al.(2014)</label><mixed-citation>
Teixeira, J., Waliser, D., Ferraro, R., Gleckler, P., Lee, T., and Potter,
G.: Satellite observations for CMIP5: The genesis of Obs4MIPs, B. Am.
Meteorol. Soc., 95, 1329–1334, 2014.
</mixed-citation></ref-html>
<ref-html id="bib1.bib41"><label>Williams et al.(2011)</label><mixed-citation>
Williams, D. N., Taylor, K. E., Cinquini, L., Evans, B., Kawamiya, M.,
Lautenschlager, M., Lawrence, B., Middleton, D., and ESGF Contributors: The
Earth System Grid Federation: Software framework supporting CMIP5 data
analysis and dissemination, CLIVAR Exchanges, 56, 40–42, 2011.
</mixed-citation></ref-html>
<ref-html id="bib1.bib42"><label>Williams et al.(2015)</label><mixed-citation>
Williams, D. N., Balaji, V., Cinquini, L., Denvil, S., Duffy, D., Evans, B.,
Ferraro, R., Hansen, R., Lautenschlager, M., and Trenham, C.: A Global
Repository for Planet-Sized Experiments and Observations, B. Am. Meteorol.
Soc., 97, 803–816, <a href="https://doi.org/10.1175/BAMS-D-15-00132.1" target="_blank">https://doi.org/10.1175/BAMS-D-15-00132.1</a>, 2015.
</mixed-citation></ref-html>
<ref-html id="bib1.bib43"><label>Zhou et al.(2018)</label><mixed-citation>
Zhou, G., Weigel, T., and Plale, B.: Persistent Identifier Kernel Information
for Machine Discovery, in: Joint Conference on Digital Libraries, 2018.
</mixed-citation></ref-html>
<ref-html id="bib1.bib44"><label>Ziv and Lempel(1977)</label><mixed-citation>
Ziv, J. and Lempel, A.: A universal algorithm for sequential data
compression, IEEE T. Inform. Theory, 23, 337–343, 1977.
</mixed-citation></ref-html>--></article>
