CDMS2 writes out data using the NetCDF library
NetCDF4 allows for file compression; a good blog post about NetCDF4 and compression can be found here.
From this blog:
"The netCDF-4 libraries inherit the capability for data compression from the HDF5 storage layer underneath the netCDF-4 interface. Linking a program that uses netCDF to a netCDF-4 library allows the program to read compressed data without changing a single line of the program source code."
and
"Also, we're only dealing with lossless compression"
This Notebook shows how to control NetCDF4 compression (shuffling/deflating) capabilities via cdms2.
© The CDAT software was developed by LLNL. This tutorial was written by Charles Doutriaux. This work was performed under the auspices of the U.S. Department of Energy by Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344.
from __future__ import print_function
import subprocess
import shlex
import numpy
import os
import io
import time
# Get file size
def size_it(filename):
    """Return the size of *filename* on disk, in bytes."""
    return os.path.getsize(filename)
# Write and return time
def dump(data, filename="example.nc"):
    """Write *data* to *filename* with cdms2 and report the cost.

    The file is opened in write mode, *data* is stored under the id
    ``"data"`` and the file is closed again. The NetCDF flags currently
    set on the ``cdms2`` module are the ones in effect for the write.

    Parameters:
        data: array-like object accepted by ``cdms2`` file ``write``.
        filename: path of the file to (over)write.

    Returns:
        A ``(seconds, size)`` tuple: wall-clock time spent on
        open + write + close, and the resulting file size in bytes.
    """
    start = time.time()
    f = cdms2.open(filename, "w")
    try:
        f.write(data, id="data")
    finally:
        # Always release the handle, even when the write fails, so that a
        # later call can reopen the same path.
        f.close()
    return time.time() - start, size_it(filename)
class HTML(object):
    """Tiny wrapper exposing a raw HTML string through ``_repr_html_``."""

    def __init__(self, html):
        # Kept verbatim: no escaping or validation is applied.
        self.html = html

    def _repr_html_(self):
        """Return the wrapped markup unchanged."""
        markup = self.html
        return markup
# Nice html output for ncdump
class NCINFO(object):
    """Show the ``ncdump`` output of a NetCDF file as HTML.

    Lines mentioning chunking, deflation or the file format are rendered
    in bold so that the storage settings stand out.

    Parameters:
        filename: path of the NetCDF file to inspect.
        variable: optional variable name, passed to ncdump as ``-v``.
        options: optional string of extra ncdump arguments.
    """

    # Lower-case substrings marking the ncdump lines to highlight.
    _KEYWORDS = ("chunk", "deflate", "classic", "netcdf4", "netcdf-4")

    def __init__(self, filename, variable=None, options=""):
        self.filename = filename
        self.variable = variable
        self.options = options

    def _repr_html_(self):
        """Return the ncdump report as HTML, one ``<br>``-separated line each."""
        rendered = []
        for line in self.nc_info().split("\n"):
            lowered = line.lower()
            # Wrap at most once, even when several keywords match the line.
            if any(kw in lowered for kw in self._KEYWORDS):
                line = "<b>{0}</b>".format(line)
            # Plain whitespace collapses in HTML; use entities so the
            # tab indentation of ncdump stays visible.
            rendered.append(line.replace("\t", "&nbsp;&nbsp;"))
        return "<br>".join(rendered)

    def _command(self):
        """Build the ncdump argument list (default: ``ncdump -hs filename``)."""
        cmd = ["ncdump", "-hs"]
        if self.options:
            cmd.extend(shlex.split(self.options))
        if self.variable is not None:
            cmd.extend(["-v", str(self.variable)])
        # The file name is appended as a single argument, so paths
        # containing spaces are not split.
        cmd.append(self.filename)
        return cmd

    @staticmethod
    def _to_text(raw):
        """Decode subprocess output to text; pass text through untouched."""
        if isinstance(raw, bytes):
            return raw.decode("utf-8", "replace")
        return raw

    def nc_info(self):
        """Call ncdump on the file and return a plain-text report.

        Can pass a variable or optional ncdump arguments.
        Default call `ncdump -hs filename`.

        The report holds the command that was run, the ncdump output
        between two dashed separators (followed by ncdump's error output
        when it exits with a non-zero status) and the file size in bytes.
        """
        cmd = self._command()
        report = ["Runnning {0}".format(" ".join(cmd))]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        o, e = p.communicate()
        report.append("-------")
        report.append(self._to_text(o))
        if p.returncode != 0:
            # Surface the failure instead of showing an empty dump.
            report.append(self._to_text(e))
        report.append("-------")
        report.append("File Size {0} bytes".format(size_it(self.filename)))
        return "\n".join(report) + "\n"
import requests
def download(fnm):
    """Fetch sample file *fnm* from the CDAT sample-data URL.

    The body is streamed to a local file with the same name, in the
    current working directory.

    Parameters:
        fnm: file name, relative to the sample-data URL.

    Raises:
        requests.exceptions.HTTPError: the server answered with an error
            status; nothing is written locally in that case.
    """
    r = requests.get("https://uvcdat.llnl.gov/cdat/sample_data/%s" % fnm, stream=True)
    try:
        # Check the status before touching the local file so that an error
        # page is never saved under the NetCDF file name.
        r.raise_for_status()
        with open(fnm, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
    finally:
        r.close()


download("clt.nc")
data = numpy.random.random((120, 180, 360))
# Random data do not compress well at all, so switch to an array of 0/1.
# numpy.float64 is spelled out because the numpy.float alias no longer
# exists in recent NumPy releases; the resulting dtype is the same.
data = numpy.greater(data, .5).astype(numpy.float64)
By default, cdms2 writes out data in NetCDF4 classic with no shuffling and a deflate level of 1.
To see the NetCDF settings that will be used to write data out, use the following commands:
import cdms2
# Report each NetCDF-related cdms2 setting next to its label.
for label, getter in (
    ("NetCDF4? ", cdms2.getNetcdf4Flag),
    ("NetCDF Classic?", cdms2.getNetcdfClassicFlag),
    ("NetCDF4 Shuffling", cdms2.getNetcdfShuffleFlag),
    ("NetCDF4 Deflate?", cdms2.getNetcdfDeflateFlag),
    ("NetCDF4 Deflate Level?", cdms2.getNetcdfDeflateLevelFlag),
):
    print(label, getter())
These values are read in at the time you open the file for writing
Note the BOLD lines
# Write the array with the cdms2 settings currently in effect; the returned
# (time, size) tuple is not kept.
dump(data)
# Build the ncdump view of the file just written (displayed by the notebook
# when this is the last expression of a cell).
NCINFO("example.nc")
# Set shuffle, deflate and deflate level all to 0, then rewrite and inspect
# the file.
value = 0
cdms2.setNetcdfShuffleFlag(value) ## where value is either 0 or 1
cdms2.setNetcdfDeflateFlag(value) ## where value is either 0 or 1
cdms2.setNetcdfDeflateLevelFlag(value) ## where value is an integer between 0 and 9 inclusive
dump(data)
NCINFO("example.nc")
All of these options can be set to 0 to enable NetCDF3 (as the warning above shows). Alternatively, one can use the single command:
# Single call meant to switch cdms2 to NetCDF3 output.
cdms2.useNetcdf3()
# or, for versions earlier than 2.12.2017.10.25, set every flag by hand:
value = 0
cdms2.setNetcdfShuffleFlag(value) ## where value is either 0 or 1
cdms2.setNetcdfDeflateFlag(value) ## where value is either 0 or 1
cdms2.setNetcdfDeflateLevelFlag(value) ## where value is an integer between 0 and 9 inclusive
cdms2.setNetcdf4Flag(0)
# Rewrite the file with these settings and inspect it.
dump(data)
NCINFO("example.nc")
# Turn the NetCDF4 flag on and the classic flag off, then rewrite and
# inspect the file.
cdms2.setNetcdf4Flag(1)
cdms2.setNetcdfClassicFlag(0)
dump(data)
NCINFO("example.nc")
# Same as before (NetCDF4 on, classic off) but with the shuffle flag on.
cdms2.setNetcdf4Flag(1)
cdms2.setNetcdfClassicFlag(0)
cdms2.setNetcdfShuffleFlag(1)
dump(data)
NCINFO("example.nc")
# Shuffle off, deflate on at level 5; the NetCDF4/classic flags are left as
# previously set.
cdms2.setNetcdfShuffleFlag(0)
cdms2.setNetcdfDeflateFlag(1)
cdms2.setNetcdfDeflateLevelFlag(5)
dump(data)
NCINFO("example.nc")
# Load the "clt" variable from the downloaded sample file, then close the
# file: only the returned variable is used below.
# NOTE(review): relies on f("clt") returning the data in memory rather than
# a lazy file-backed variable -- confirm against the cdms2 version in use.
f = cdms2.open("clt.nc")
try:
    clt = f("clt")
finally:
    f.close()
# Opening tag and header row of the results table; rows are appended later.
html = "<table border='2'><tr><th>Deflate Level</th><th>NC3</th><th>NC4 Classic no shuffle</th><th>NC4 Classic shuffled</th><th>NC4 no shuffle</th><th>NC4 shuffled</th></tr>"
def addCell():
    """Write ``clt`` with the current settings and return one table cell.

    The cell text is ``<seconds with two decimals>/<size in bytes>``.
    """
    elapsed, nbytes = dump(clt)
    cell = "<td align='center'>{:.2f}/{:d}</td>"
    return cell.format(elapsed, nbytes)
def nc4s():
    """Return the four NetCDF4 cells of a table row plus the closing ``</tr>``.

    Cells are produced with the classic flag on then off, and for each of
    those with the shuffle flag off then on. Both flags are left at the
    last values set (classic 0, shuffle 1).
    """
    pieces = []
    for classic_flag in (1, 0):
        cdms2.setNetcdfClassicFlag(classic_flag)
        for shuffle_flag in (0, 1):
            cdms2.setNetcdfShuffleFlag(shuffle_flag)
            pieces.append(addCell())
    pieces.append("</tr>")
    return "".join(pieces)
# Row for deflate level 0: the first cell is written with the NetCDF4 flag
# off (NetCDF3), the remaining four with it back on.
html+="<tr><td align='center'>0</td>"
cdms2.useNetcdf3()
cdms2.setNetcdf4Flag(0)
html+=addCell()
cdms2.setNetcdf4Flag(1)
html+=nc4s()
# Rows for deflate levels 1 to 9: deflation is switched on once, and the
# NC3 column is filled with "N/A".
cdms2.setNetcdfDeflateFlag(1)
for i in range(1,10):
    cdms2.setNetcdfDeflateLevelFlag(i)
    html += "<tr><td align='center'>{0}</td><td align='center'>N/A</td>".format(i)
    html += nc4s()
# Add the caption and close the table.
html+="<caption>Time To Write NetCDF File and size for various NC4 settings</caption></table>"
# Wrap the markup so a notebook renders it when this is the last expression
# of the cell.
HTML(html)