HOME


sh-3ll 1.0
DIR:/usr/local/lib/python3.6/site-packages/xarray/core/
Upload File :
Current File : //usr/local/lib/python3.6/site-packages/xarray/core/dataset.py
import functools
import warnings
from collections import Mapping
from numbers import Number

import numpy as np
import pandas as pd

from . import ops
from . import utils
from . import common
from . import groupby
from . import indexing
from . import alignment
from . import formatting
from .. import conventions
from .alignment import align, align_variables
from .coordinates import DatasetCoordinates, Indexes
from .common import ImplementsDatasetReduce, BaseDataObject
from .merge import merge_datasets, expand_variables
from .utils import Frozen, SortedKeysDict, ChainMap, maybe_wrap_array, hashable
from .variable import (as_variable, Variable, Coordinate, broadcast_variables,
                       default_index_coordinate)
from .pycompat import (iteritems, basestring, OrderedDict,
                       dask_array_type)
from .combine import concat


# list of attributes of pd.DatetimeIndex that are ndarrays of time info
_DATETIMEINDEX_COMPONENTS = ['year', 'month', 'day', 'hour', 'minute',
                             'second', 'microsecond', 'nanosecond', 'date',
                             'time', 'dayofyear', 'weekofyear', 'dayofweek',
                             'quarter']


def _get_virtual_variable(variables, key):
    """Get a virtual variable (e.g., 'time.year') from a dict of
    xarray.Variable objects (if possible)
    """
    if not isinstance(key, basestring):
        raise KeyError(key)

    split_key = key.split('.', 1)
    if len(split_key) != 2:
        raise KeyError(key)

    ref_name, var_name = split_key
    ref_var = variables[ref_name]
    if ref_var.ndim == 1:
        date = ref_var.to_index()
    elif ref_var.ndim == 0:
        date = pd.Timestamp(ref_var.values)
    else:
        raise KeyError(key)

    if var_name == 'season':
        # TODO: move 'season' into pandas itself
        seasons = np.array(['DJF', 'MAM', 'JJA', 'SON'])
        month = date.month
        data = seasons[(month // 3) % 4]
    else:
        data = getattr(date, var_name)
    return ref_name, var_name, Variable(ref_var.dims, data)


def _calculate_dims(variables):
    """Calculate the dimensions corresponding to a set of variables.

    Returns dictionary mapping from dimension names to sizes. Raises ValueError
    if any of the dimension sizes conflict.
    """
    dims = {}
    last_used = {}
    scalar_vars = set(k for k, v in iteritems(variables) if not v.dims)
    for k, var in iteritems(variables):
        for dim, size in zip(var.dims, var.shape):
            if dim in scalar_vars:
                raise ValueError('dimension %s already exists as a scalar '
                                 'variable' % dim)
            if dim not in dims:
                dims[dim] = size
                last_used[dim] = k
            elif dims[dim] != size:
                raise ValueError('conflicting sizes for dimension %r: '
                                 'length %s on %r and length %s on %r' %
                                 (dim, size, k, dims[dim], last_used[dim]))
    return dims


def _assert_empty(args, msg='%s'):
    if args:
        raise ValueError(msg % args)


def as_dataset(obj):
    """Cast the given object to a Dataset.

    Handles Datasets, DataArrays and dictionaries of variables. A new Dataset
    object is only created if the provided object is not already one.
    """
    if hasattr(obj, 'to_dataset'):
        obj = obj.to_dataset()
    if not isinstance(obj, Dataset):
        obj = Dataset(obj)
    return obj


class DataVariables(Mapping):
    def __init__(self, dataset):
        self._dataset = dataset

    def __iter__(self):
        return (key for key in self._dataset._variables
                if key not in self._dataset._coord_names)

    def __len__(self):
        return len(self._dataset._variables) - len(self._dataset._coord_names)

    def __contains__(self, key):
        return (key in self._dataset._variables and
                key not in self._dataset._coord_names)

    def __getitem__(self, key):
        if key not in self._dataset._coord_names:
            return self._dataset[key]
        else:
            raise KeyError(key)

    def __repr__(self):
        return formatting.vars_repr(self)


class _LocIndexer(object):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, key):
        if not utils.is_dict_like(key):
            raise TypeError('can only lookup dictionaries from Dataset.loc')
        return self.dataset.sel(**key)


class Dataset(Mapping, ImplementsDatasetReduce, BaseDataObject):
    """A multi-dimensional, in memory, array database.

    A dataset resembles an in-memory representation of a NetCDF file, and
    consists of variables, coordinates and attributes which together form a
    self describing dataset.

    Dataset implements the mapping interface with keys given by variable names
    and values given by DataArray objects for each variable name.

    One dimensional variables with name equal to their dimension are index
    coordinates used for label based indexing.
    """
    groupby_cls = groupby.DatasetGroupBy

    def __init__(self, data_vars=None, coords=None, attrs=None,
                 compat='broadcast_equals', **kwargs):
        """To load data from a file or file-like object, use the `open_dataset`
        function.

        Parameters
        ----------
        data_vars : dict-like, optional
            A mapping from variable names to :py:class:`~xarray.DataArray`
            objects, :py:class:`~xarray.Variable` objects or tuples of the
            form ``(dims, data[, attrs])`` which can be used as arguments to
            create a new ``Variable``. Each dimension must have the same length
            in all variables in which it appears.
        coords : dict-like, optional
            Another mapping in the same form as the `variables` argument,
            except the each item is saved on the dataset as a "coordinate".
            These variables have an associated meaning: they describe
            constant/fixed/independent quantities, unlike the
            varying/measured/dependent quantities that belong in `variables`.
            Coordinates values may be given by 1-dimensional arrays or scalars,
            in which case `dims` do not need to be supplied: 1D arrays will be
            assumed to give index values along the dimension with the same
            name.
        attrs : dict-like, optional
            Global attributes to save on this dataset.
        compat : {'broadcast_equals', 'equals', 'identical'}, optional
            String indicating how to compare variables of the same name for
            potential conflicts:

            - 'broadcast_equals': all values must be equal when variables are
              broadcast against each other to ensure common dimensions.
            - 'equals': all values and dimensions must be the same.
            - 'identical': all values, dimensions and attributes must be the
              same.
        """
        self._variables = OrderedDict()
        self._coord_names = set()
        self._dims = {}
        self._attrs = None
        self._file_obj = None
        if kwargs:
            if 'variables' in kwargs:
                data_vars = kwargs.pop('variables')
                warnings.warn('Variables kwarg is deprecated. Use data_vars', stacklevel=2)
            if kwargs:
                raise TypeError('{0} are not valid kwargs'.format(kwargs.keys()))
        if data_vars is None:
            data_vars = {}
        if coords is None:
            coords = set()
        if data_vars is not None or coords is not None:
            self._set_init_vars_and_dims(data_vars, coords, compat)
        if attrs is not None:
            self.attrs = attrs
        self._initialized = True

    def _add_missing_coords_inplace(self):
        """Add missing coordinates to self._variables
        """
        for dim, size in iteritems(self.dims):
            if dim not in self._variables:
                self._variables[dim] = default_index_coordinate(dim, size)

    def _update_vars_and_coords(self, new_variables, new_coord_names=None,
                                needs_copy=True, check_coord_names=True):
        """Add a dictionary of new variables to this dataset.

        Raises a ValueError if any dimensions have conflicting lengths in the
        new dataset. Otherwise will update this dataset's _variables and
        _dims attributes in-place.

        Set `needs_copy=False` only if this dataset is brand-new and hence
        can be thrown away if this method fails.
        """
        if new_coord_names is None:
            new_coord_names = {}
        # default to creating another copy of variables so can unroll if we end
        # up with inconsistent dimensions
        variables = self._variables.copy() if needs_copy else self._variables

        if check_coord_names:
            _assert_empty([k for k in self.data_vars if k in new_coord_names],
                          'coordinates with these names already exist as '
                          'variables: %s')

        variables.update(new_variables)
        dims = _calculate_dims(variables)
        # all checks are complete: it's safe to update
        self._variables = variables
        self._dims = dims
        self._add_missing_coords_inplace()
        self._coord_names.update(new_coord_names)

    def _set_init_vars_and_dims(self, vars, coords, compat):
        """Set the initial value of Dataset variables and dimensions
        """
        _assert_empty([k for k in vars if k in coords],
                      'redundant variables and coordinates: %s')
        variables = ChainMap(vars, coords)

        aligned = align_variables(variables)
        new_variables, new_coord_names = expand_variables(aligned,
                                                          compat=compat)

        new_coord_names.update(coords)
        self._update_vars_and_coords(new_variables, new_coord_names,
                                     needs_copy=False, check_coord_names=False)

    @classmethod
    def load_store(cls, store, decoder=None):
        """Create a new dataset from the contents of a backends.*DataStore
        object
        """
        variables, attributes = store.load()
        if decoder:
            variables, attributes = decoder(variables, attributes)
        obj = cls(variables, attrs=attributes)
        obj._file_obj = store
        return obj

    def close(self):
        """Close any files linked to this dataset
        """
        if self._file_obj is not None:
            self._file_obj.close()
        self._file_obj = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __getstate__(self):
        """Always load data in-memory before pickling"""
        self.load()
        # self.__dict__ is the default pickle object, we don't need to
        # implement our own __setstate__ method to make pickle work
        state = self.__dict__.copy()
        # throw away any references to datastores in the pickle
        state['_file_obj'] = None
        return state

    @property
    def variables(self):
        """Frozen dictionary of xarray.Variable objects constituting this
        dataset's data
        """
        return Frozen(self._variables)

    def _attrs_copy(self):
        return None if self._attrs is None else OrderedDict(self._attrs)

    @property
    def attrs(self):
        """Dictionary of global attributes on this dataset
        """
        if self._attrs is None:
            self._attrs = OrderedDict()
        return self._attrs

    @attrs.setter
    def attrs(self, value):
        self._attrs = OrderedDict(value)

    @property
    def dims(self):
        """Mapping from dimension names to lengths.

        This dictionary cannot be modified directly, but is updated when adding
        new variables.
        """
        return Frozen(SortedKeysDict(self._dims))

    def load(self):
        """Manually trigger loading of this dataset's data from disk or a
        remote source into memory and return this dataset.

        Normally, it should not be necessary to call this method in user code,
        because all xarray functions should either work on deferred data or
        load data automatically. However, this method can be necessary when
        working with many file objects on disk.
        """
        # access .data to coerce everything to numpy or dask arrays
        all_data = dict((k, v.data) for k, v in self.variables.items())
        lazy_data = dict((k, v) for k, v in all_data.items()
                         if isinstance(v, dask_array_type))
        if lazy_data:
            import dask.array as da

            # evaluate all the dask arrays simultaneously
            evaluated_data = da.compute(*lazy_data.values())

            for k, data in zip(lazy_data, evaluated_data):
                self.variables[k].data = data

        return self

    def load_data(self):  # pragma: no cover
        warnings.warn('the Dataset method `load_data` has been deprecated; '
                      'use `load` instead',
                      FutureWarning, stacklevel=2)
        return self.load()

    @classmethod
    def _construct_direct(cls, variables, coord_names, dims, attrs,
                          file_obj=None):
        """Shortcut around __init__ for internal use when we want to skip
        costly validation
        """
        obj = object.__new__(cls)
        obj._variables = variables
        obj._coord_names = coord_names
        obj._dims = dims
        obj._attrs = attrs
        obj._file_obj = file_obj
        obj._initialized = True
        return obj

    __default_attrs = object()

    def _replace_vars_and_dims(self, variables, coord_names=None,
                               attrs=__default_attrs, inplace=False):
        """Fastpath constructor for internal use.

        Preserves coord names and attributes; dimensions are recalculated from
        the supplied variables.

        The arguments are *not* copied when placed on the new dataset. It is up
        to the caller to ensure that they have the right type and are not used
        elsewhere.

        Parameters
        ----------
        variables : OrderedDict
        coord_names : set or None, optional
        attrs : OrderedDict or None, optional

        Returns
        -------
        new : Dataset
        """
        dims = _calculate_dims(variables)
        if inplace:
            self._dims = dims
            self._variables = variables
            if coord_names is not None:
                self._coord_names = coord_names
            if attrs is not self.__default_attrs:
                self._attrs = attrs
            obj = self
        else:
            if coord_names is None:
                coord_names = self._coord_names.copy()
            if attrs is self.__default_attrs:
                attrs = self._attrs_copy()
            obj = self._construct_direct(variables, coord_names, dims, attrs)
        return obj

    def copy(self, deep=False):
        """Returns a copy of this dataset.

        If `deep=True`, a deep copy is made of each of the component variables.
        Otherwise, a shallow copy is made, so each variable in the new dataset
        is also a variable in the original dataset.
        """
        if deep:
            variables = OrderedDict((k, v.copy(deep=True))
                                    for k, v in iteritems(self._variables))
        else:
            variables = self._variables.copy()
        # skip __init__ to avoid costly validation
        return self._construct_direct(variables, self._coord_names.copy(),
                                      self._dims.copy(), self._attrs_copy())

    def _subset_with_all_valid_coords(self, variables, coord_names, attrs):
        needed_dims = set()
        for v in variables.values():
            needed_dims.update(v.dims)
        for k in self._coord_names:
            if set(self.variables[k].dims) <= needed_dims:
                variables[k] = self._variables[k]
                coord_names.add(k)
        dims = dict((k, self._dims[k]) for k in needed_dims)

        return self._construct_direct(variables, coord_names, dims, attrs)

    def _copy_listed(self, names):
        """Create a new Dataset with the listed variables from this dataset and
        the all relevant coordinates. Skips all validation.
        """
        variables = OrderedDict()
        coord_names = set()

        for name in names:
            try:
                variables[name] = self._variables[name]
            except KeyError:
                ref_name, var_name, var = _get_virtual_variable(
                    self._variables, name)
                variables[var_name] = var
                if ref_name in self._coord_names:
                    coord_names.add(var_name)

        return self._subset_with_all_valid_coords(variables, coord_names,
                                                  attrs=self.attrs.copy())

    def _construct_dataarray(self, name):
        """Construct a DataArray by indexing this dataset
        """
        from .dataarray import DataArray

        try:
            variable = self._variables[name]
        except KeyError:
            _, name, variable = _get_virtual_variable(self._variables, name)

        coords = OrderedDict()
        needed_dims = set(variable.dims)
        for k in self.coords:
            if set(self.variables[k].dims) <= needed_dims:
                coords[k] = self.variables[k]

        return DataArray(variable, coords, name=name, fastpath=True)

    def __copy__(self):
        return self.copy(deep=False)

    def __deepcopy__(self, memo=None):
        # memo does nothing but is required for compatibility with
        # copy.deepcopy
        return self.copy(deep=True)

    def __contains__(self, key):
        """The 'in' operator will return true or false depending on whether
        'key' is an array in the dataset or not.
        """
        return key in self._variables

    def __len__(self):
        return len(self._variables)

    def __iter__(self):
        return iter(self._variables)

    @property
    def nbytes(self):
        return sum(v.nbytes for v in self.variables.values())

    @property
    def loc(self):
        """Attribute for location based indexing. Only supports __getitem__,
        and only when the key is a dict of the form {dim: labels}.
        """
        return _LocIndexer(self)

    def __getitem__(self, key):
        """Access variables or coordinates this dataset as a
        :py:class:`~xarray.DataArray`.

        Indexing with a list of names will return a new ``Dataset`` object.
        """
        if utils.is_dict_like(key):
            return self.isel(**key)

        if hashable(key):
            return self._construct_dataarray(key)
        else:
            return self._copy_listed(np.asarray(key))

    def __setitem__(self, key, value):
        """Add an array to this dataset.

        If value is a `DataArray`, call its `select_vars()` method, rename it
        to `key` and merge the contents of the resulting dataset into this
        dataset.

        If value is an `Variable` object (or tuple of form
        ``(dims, data[, attrs])``), add it to this dataset as a new
        variable.
        """
        if utils.is_dict_like(key):
            raise NotImplementedError('cannot yet use a dictionary as a key '
                                      'to set Dataset values')
        self.update({key: value})

    def __delitem__(self, key):
        """Remove a variable from this dataset.

        If this variable is a dimension, all variables containing this
        dimension are also removed.
        """
        def remove(k):
            del self._variables[k]
            self._coord_names.discard(k)

        remove(key)

        if key in self._dims:
            del self._dims[key]
            also_delete = [k for k, v in iteritems(self._variables)
                           if key in v.dims]
            for key in also_delete:
                remove(key)

    # mutable objects should not be hashable
    __hash__ = None

    def _all_compat(self, other, compat_str):
        """Helper function for equals and identical"""
        # some stores (e.g., scipy) do not seem to preserve order, so don't
        # require matching order for equality
        def compat(x, y):
            return getattr(x, compat_str)(y)
        return (self._coord_names == other._coord_names and
                utils.dict_equiv(self._variables, other._variables,
                                 compat=compat))

    def broadcast_equals(self, other):
        """Two Datasets are broadcast equal if they are equal after
        broadcasting all variables against each other.

        For example, variables that are scalar in one dataset but non-scalar in
        the other dataset can still be broadcast equal if the the non-scalar
        variable is a constant.

        See Also
        --------
        Dataset.equals
        Dataset.identical
        """
        try:
            return self._all_compat(other, 'broadcast_equals')
        except (TypeError, AttributeError):
            return False

    def equals(self, other):
        """Two Datasets are equal if they have matching variables and
        coordinates, all of which are equal.

        Datasets can still be equal (like pandas objects) if they have NaN
        values in the same locations.

        This method is necessary because `v1 == v2` for ``Dataset``
        does element-wise comparisons (like numpy.ndarrays).

        See Also
        --------
        Dataset.broadcast_equals
        Dataset.identical
        """
        try:
            return self._all_compat(other, 'equals')
        except (TypeError, AttributeError):
            return False

    def identical(self, other):
        """Like equals, but also checks all dataset attributes and the
        attributes on all variables and coordinates.

        See Also
        --------
        Dataset.broadcast_equals
        Dataset.equals
        """
        try:
            return (utils.dict_equiv(self.attrs, other.attrs) and
                    self._all_compat(other, 'identical'))
        except (TypeError, AttributeError):
            return False

    @property
    def indexes(self):
        """OrderedDict of pandas.Index objects used for label based indexing
        """
        return Indexes(self)

    @property
    def coords(self):
        """Dictionary of xarray.DataArray objects corresponding to coordinate
        variables
        """
        return DatasetCoordinates(self)

    @property
    def data_vars(self):
        """Dictionary of xarray.DataArray objects corresponding to data variables
        """
        return DataVariables(self)

    @property
    def vars(self):  # pragma: no cover
        warnings.warn('the Dataset property `vars` has been deprecated; '
                      'use `data_vars` instead',
                      FutureWarning, stacklevel=2)
        return self.data_vars

    def set_coords(self, names, inplace=False):
        """Given names of one or more variables, set them as coordinates

        Parameters
        ----------
        names : str or list of str
            Name(s) of variables in this dataset to convert into coordinates.
        inplace : bool, optional
            If True, modify this dataset inplace. Otherwise, create a new
            object.

        Returns
        -------
        Dataset
        """
        # TODO: allow inserting new coordinates with this method, like
        # DataFrame.set_index?
        # nb. check in self._variables, not self.data_vars to insure that the
        # operation is idempotent
        if isinstance(names, basestring):
            names = [names]
        self._assert_all_in_dataset(names)
        obj = self if inplace else self.copy()
        obj._coord_names.update(names)
        return obj

    def reset_coords(self, names=None, drop=False, inplace=False):
        """Given names of coordinates, reset them to become variables

        Parameters
        ----------
        names : str or list of str, optional
            Name(s) of non-index coordinates in this dataset to reset into
            variables. By default, all non-index coordinates are reset.
        drop : bool, optional
            If True, remove coordinates instead of converting them into
            variables.
        inplace : bool, optional
            If True, modify this dataset inplace. Otherwise, create a new
            object.

        Returns
        -------
        Dataset
        """
        if names is None:
            names = self._coord_names - set(self.dims)
        else:
            if isinstance(names, basestring):
                names = [names]
            self._assert_all_in_dataset(names)
            _assert_empty(
                set(names) & set(self.dims),
                'cannot remove index coordinates with reset_coords: %s')
        obj = self if inplace else self.copy()
        obj._coord_names.difference_update(names)
        if drop:
            for name in names:
                del obj._variables[name]
        return obj

    def dump_to_store(self, store, encoder=None, sync=True, encoding=None):
        """Store dataset contents to a backends.*DataStore object."""
        if encoding is None:
            encoding = {}
        variables, attrs = conventions.encode_dataset_coordinates(self)

        check_encoding = set()
        for k, enc in encoding.items():
            # no need to shallow copy the variable again; that already happened
            # in encode_dataset_coordinates
            variables[k].encoding = enc
            check_encoding.add(k)

        if encoder:
            variables, attrs = encoder(variables, attrs)

        store.store(variables, attrs, check_encoding)
        if sync:
            store.sync()

    def to_netcdf(self, path=None, mode='w', format=None, group=None,
                  engine=None, encoding=None):
        """Write dataset contents to a netCDF file.

        Parameters
        ----------
        path : str, optional
            Path to which to save this dataset. If no path is provided, this
            function returns the resulting netCDF file as a bytes object; in
            this case, we need to use scipy.io.netcdf, which does not support
            netCDF version 4 (the default format becomes NETCDF3_64BIT).
        mode : {'w', 'a'}, optional
            Write ('w') or append ('a') mode. If mode='w', any existing file at
            this location will be overwritten.
        format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional
            File format for the resulting netCDF file:

            * NETCDF4: Data is stored in an HDF5 file, using netCDF4 API
              features.
            * NETCDF4_CLASSIC: Data is stored in an HDF5 file, using only
              netCDF 3 compatible API features.
            * NETCDF3_64BIT: 64-bit offset version of the netCDF 3 file format,
              which fully supports 2+ GB files, but is only compatible with
              clients linked against netCDF version 3.6.0 or later.
            * NETCDF3_CLASSIC: The classic netCDF 3 file format. It does not
              handle 2+ GB files very well.

            All formats are supported by the netCDF4-python library.
            scipy.io.netcdf only supports the last two formats.

            The default format is NETCDF4 if you are saving a file to disk and
            have the netCDF4-python library available. Otherwise, xarray falls
            back to using scipy to write netCDF files and defaults to the
            NETCDF3_64BIT format (scipy does not support netCDF4).
        group : str, optional
            Path to the netCDF4 group in the given file to open (only works for
            format='NETCDF4'). The group(s) will be created if necessary.
        engine : {'netcdf4', 'scipy', 'h5netcdf'}, optional
            Engine to use when writing netCDF files. If not provided, the
            default engine is chosen based on available dependencies, with a
            preference for 'netcdf4' if writing to a file on disk.
        encoding : dict, optional
            Nested dictionary with variable names as keys and dictionaries of
            variable specific encodings as values, e.g.,
            ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}``
        """
        if encoding is None:
            encoding = {}
        from ..backends.api import to_netcdf
        return to_netcdf(self, path, mode, format=format, group=group,
                         engine=engine, encoding=encoding)

    dump = utils.function_alias(to_netcdf, 'dump')
    dumps = utils.function_alias(to_netcdf, 'dumps')

    def __repr__(self):
        return formatting.dataset_repr(self)

    @property
    def chunks(self):
        """Block dimensions for this dataset's data or None if it's not a dask
        array.
        """
        chunks = {}
        for v in self.variables.values():
            if v.chunks is not None:
                new_chunks = list(zip(v.dims, v.chunks))
                if any(chunk != chunks[d] for d, chunk in new_chunks
                       if d in chunks):
                    raise ValueError('inconsistent chunks')
                chunks.update(new_chunks)
        return Frozen(SortedKeysDict(chunks))

    def chunk(self, chunks=None, name_prefix='xarray-', token=None,
              lock=False):
        """Coerce all arrays in this dataset into dask arrays with the given
        chunks.

        Non-dask arrays in this dataset will be converted to dask arrays. Dask
        arrays will be rechunked to the given chunk sizes.

        If neither chunks is not provided for one or more dimensions, chunk
        sizes along that dimension will not be updated; non-dask arrays will be
        converted into dask arrays with a single block.

        Parameters
        ----------
        chunks : int or dict, optional
            Chunk sizes along each dimension, e.g., ``5`` or
            ``{'x': 5, 'y': 5}``.
        name_prefix : str, optional
            Prefix for the name of any new dask arrays.
        token : str, optional
            Token uniquely identifying this dataset.
        lock : optional
            Passed on to :py:func:`dask.array.from_array`, if the array is not
            already as dask array.

        Returns
        -------
        chunked : xarray.Dataset
        """
        try:
            from dask.base import tokenize
        except ImportError:
            import dask  # raise the usual error if dask is entirely missing
            raise ImportError('xarray requires dask version 0.6 or newer')

        if isinstance(chunks, Number):
            chunks = dict.fromkeys(self.dims, chunks)

        if chunks is not None:
            bad_dims = [d for d in chunks if d not in self.dims]
            if bad_dims:
                raise ValueError('some chunks keys are not dimensions on this '
                                 'object: %s' % bad_dims)

        def selkeys(dict_, keys):
            if dict_ is None:
                return None
            return dict((d, dict_[d]) for d in keys if d in dict_)

        def maybe_chunk(name, var, chunks):
            chunks = selkeys(chunks, var.dims)
            if not chunks:
                chunks = None
            if var.ndim > 0:
                token2 = tokenize(name, token if token else var._data)
                name2 = '%s%s-%s' % (name_prefix, name, token2)
                return var.chunk(chunks, name=name2, lock=lock)
            else:
                return var

        variables = OrderedDict([(k, maybe_chunk(k, v, chunks))
                                 for k, v in self.variables.items()])
        return self._replace_vars_and_dims(variables)

    def isel(self, **indexers):
        """Returns a new dataset with each array indexed along the specified
        dimension(s).

        This method selects values from each array using its `__getitem__`
        method, except this method does not require knowing the order of
        each array's dimensions.

        Parameters
        ----------
        **indexers : {dim: indexer, ...}
            Keyword arguments with names matching dimensions and values given
            by integers, slice objects or arrays.

        Returns
        -------
        obj : Dataset
            A new Dataset with the same contents as this dataset, except each
            array and dimension is indexed by the appropriate indexers. In
            general, each array's data will be a view of the array's data
            in this dataset, unless numpy fancy indexing was triggered by using
            an array indexer, in which case the data will be a copy.

        See Also
        --------
        Dataset.sel
        Dataset.sel_points
        Dataset.isel_points
        DataArray.isel
        """
        invalid = [k for k in indexers if k not in self.dims]
        if invalid:
            raise ValueError("dimensions %r do not exist" % invalid)

        # all indexers should be int, slice or np.ndarrays
        indexers = [(k, (np.asarray(v)
                         if not isinstance(v, (int, np.integer, slice))
                         else v))
                    for k, v in iteritems(indexers)]

        variables = OrderedDict()
        for name, var in iteritems(self._variables):
            var_indexers = dict((k, v) for k, v in indexers if k in var.dims)
            variables[name] = var.isel(**var_indexers)
        return self._replace_vars_and_dims(variables)

    def sel(self, method=None, tolerance=None, **indexers):
        """Returns a new dataset with each array indexed by tick labels
        along the specified dimension(s).

        In contrast to `Dataset.isel`, indexers for this method should use
        labels instead of integers.

        Under the hood, this method is powered by using pandas's powerful Index
        objects. This makes label based indexing essentially just as fast as
        using integer indexing.

        It also means this method uses pandas's (well documented) logic for
        indexing. This means you can use string shortcuts for datetime indexes
        (e.g., '2000-01' to select all values in January 2000). It also means
        that slices are treated as inclusive of both the start and stop values,
        unlike normal Python indexing.

        Parameters
        ----------
        method : {None, 'nearest', 'pad'/'ffill', 'backfill'/'bfill'}, optional
            Method to use for inexact matches (requires pandas>=0.16):

            * None (default): only exact matches
            * pad / ffill: propgate last valid index value forward
            * backfill / bfill: propagate next valid index value backward
            * nearest: use nearest valid index value
        tolerance : optional
            Maximum distance between original and new labels for inexact
            matches. The values of the index at the matching locations most
            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
            Requires pandas>=0.17.
        **indexers : {dim: indexer, ...}
            Keyword arguments with names matching dimensions and values given
            by scalars, slices or arrays of tick labels.

        Returns
        -------
        obj : Dataset
            A new Dataset with the same contents as this dataset, except each
            variable and dimension is indexed by the appropriate indexers. In
            general, each variable's data will be a view of the variable's data
            in this dataset, unless numpy fancy indexing was triggered by using
            an array indexer, in which case the data will be a copy.

        See Also
        --------
        Dataset.isel
        Dataset.sel_points
        Dataset.isel_points
        DataArray.sel
        """
        return self.isel(**indexing.remap_label_indexers(
            self, indexers, method=method, tolerance=tolerance))

    def isel_points(self, dim='points', **indexers):
        """Returns a new dataset with each array indexed pointwise along the
        specified dimension(s).

        This method selects pointwise values from each array and is akin to
        the NumPy indexing behavior of `arr[[0, 1], [0, 1]]`, except this
        method does not require knowing the order of each array's dimensions.

        Parameters
        ----------
        dim : str or DataArray or pandas.Index or other list-like object, optional
            Name of the dimension to concatenate along. If dim is provided as a
            string, it must be a new dimension name, in which case it is added
            along axis=0. If dim is provided as a DataArray or Index or
            list-like object, its name, which must not be present in the
            dataset, is used as the dimension to concatenate along and the
            values are added as a coordinate.
        **indexers : {dim: indexer, ...}
            Keyword arguments with names matching dimensions and values given
            by array-like objects. All indexers must be the same length and
            1 dimensional.

        Returns
        -------
        obj : Dataset
            A new Dataset with the same contents as this dataset, except each
            array and dimension is indexed by the appropriate indexers. With
            pointwise indexing, the new Dataset will always be a copy of the
            original.

        See Also
        --------
        Dataset.sel
        Dataset.isel
        Dataset.sel_points
        DataArray.isel_points
        """
        indexer_dims = set(indexers)

        def relevant_keys(mapping):
            return [k for k, v in mapping.items()
                    if any(d in indexer_dims for d in v.dims)]

        data_vars = relevant_keys(self.data_vars)
        coords = relevant_keys(self.coords)

        # all the indexers should be iterables
        keys = indexers.keys()
        indexers = [(k, np.asarray(v)) for k, v in iteritems(indexers)]
        # Check that indexers are valid dims, integers, and 1D
        for k, v in indexers:
            if k not in self.dims:
                raise ValueError("dimension %s does not exist" % k)
            if v.dtype.kind != 'i':
                raise TypeError('Indexers must be integers')
            if v.ndim != 1:
                raise ValueError('Indexers must be 1 dimensional')

        # all the indexers should have the same length
        lengths = set(len(v) for k, v in indexers)
        if len(lengths) > 1:
            raise ValueError('All indexers must be the same length')

        # Existing dimensions are not valid choices for the dim argument
        if isinstance(dim, basestring):
            if dim in self.dims:
                # dim is an invalid string
                raise ValueError('Existing dimension names are not valid '
                                 'choices for the dim argument in sel_points')
        elif hasattr(dim, 'dims'):
            # dim is a DataArray or Coordinate
            if dim.name in self.dims:
                # dim already exists
                raise ValueError('Existing dimensions are not valid choices '
                                 'for the dim argument in sel_points')
        else:
            # try to cast dim to DataArray with name = points
            from .dataarray import DataArray
            dim = DataArray(dim, dims='points', name='points')

        # TODO: This would be sped up with vectorized indexing. This will
        # require dask to support pointwise indexing as well.
        return concat([self.isel(**d) for d in
                       [dict(zip(keys, inds)) for inds in
                        zip(*[v for k, v in indexers])]],
                      dim=dim, coords=coords, data_vars=data_vars)

    def sel_points(self, dim='points', method=None, tolerance=None,
                   **indexers):
        """Returns a new dataset with each array indexed pointwise by tick
        labels along the specified dimension(s).

        In contrast to `Dataset.isel_points`, indexers for this method should
        use labels instead of integers.

        In contrast to `Dataset.sel`, this method selects points along the
        diagonal of multi-dimensional arrays, not the intersection.

        Parameters
        ----------
        dim : str or DataArray or pandas.Index or other list-like object, optional
            Name of the dimension to concatenate along. If dim is provided as a
            string, it must be a new dimension name, in which case it is added
            along axis=0. If dim is provided as a DataArray or Index or
            list-like object, its name, which must not be present in the
            dataset, is used as the dimension to concatenate along and the
            values are added as a coordinate.
        method : {None, 'nearest', 'pad'/'ffill', 'backfill'/'bfill'}, optional
            Method to use for inexact matches (requires pandas>=0.16):

            * None (default): only exact matches
            * pad / ffill: propagate last valid index value forward
            * backfill / bfill: propagate next valid index value backward
            * nearest: use nearest valid index value
        tolerance : optional
            Maximum distance between original and new labels for inexact
            matches. The values of the index at the matching locations most
            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
            Requires pandas>=0.17.
        **indexers : {dim: indexer, ...}
            Keyword arguments with names matching dimensions and values given
            by array-like objects. All indexers must be the same length and
            1 dimensional.

        Returns
        -------
        obj : Dataset
            A new Dataset with the same contents as this dataset, except each
            array and dimension is indexed by the appropriate indexers. With
            pointwise indexing, the new Dataset will always be a copy of the
            original.

        See Also
        --------
        Dataset.sel
        Dataset.isel
        Dataset.isel_points
        DataArray.sel_points
        """
        pos_indexers = indexing.remap_label_indexers(
            self, indexers, method=method, tolerance=tolerance)
        return self.isel_points(dim=dim, **pos_indexers)

    def reindex_like(self, other, method=None, tolerance=None, copy=True):
        """Conform this object onto the indexes of another object, filling
        in missing values with NaN.

        Parameters
        ----------
        other : Dataset or DataArray
            Object with an 'indexes' attribute giving a mapping from dimension
            names to pandas.Index objects, which provides coordinates upon
            which to index the variables in this dataset. The indexes on this
            other object need not be the same as the indexes on this
            dataset. Any mis-matched index values will be filled in with
            NaN, and any mis-matched dimension names will simply be ignored.
        method : {None, 'nearest', 'pad'/'ffill', 'backfill'/'bfill'}, optional
            Method to use for filling index values from other not found in this
            dataset:

            * None (default): don't fill gaps
            * pad / ffill: propagate last valid index value forward
            * backfill / bfill: propagate next valid index value backward
            * nearest: use nearest valid index value (requires pandas>=0.16)
        tolerance : optional
            Maximum distance between original and new labels for inexact
            matches. The values of the index at the matching locations most
            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
            Requires pandas>=0.17.
        copy : bool, optional
            If `copy=True`, the returned dataset contains only copied
            variables. If `copy=False` and no reindexing is required then
            original variables from this dataset are returned.

        Returns
        -------
        reindexed : Dataset
            Another dataset, with this dataset's data but coordinates from the
            other object.

        See Also
        --------
        Dataset.reindex
        align
        """
        indexers = dict((k, v) for k, v in other.indexes.items()
                        if k in self.dims)
        return self.reindex(method=method, copy=copy, tolerance=tolerance,
                            **indexers)

    def reindex(self, indexers=None, method=None, tolerance=None, copy=True, **kw_indexers):
        """Conform this object onto a new set of indexes, filling in
        missing values with NaN.

        Parameters
        ----------
        indexers : dict. optional
            Dictionary with keys given by dimension names and values given by
            arrays of coordinates tick labels. Any mis-matched coordinate values
            will be filled in with NaN, and any mis-matched dimension names will
            simply be ignored.
        method : {None, 'nearest', 'pad'/'ffill', 'backfill'/'bfill'}, optional
            Method to use for filling index values in ``indexers`` not found in
            this dataset:

            * None (default): don't fill gaps
            * pad / ffill: propagate last valid index value forward
            * backfill / bfill: propagate next valid index value backward
            * nearest: use nearest valid index value (requires pandas>=0.16)
        tolerance : optional
            Maximum distance between original and new labels for inexact
            matches. The values of the index at the matching locations most
            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
            Requires pandas>=0.17.
        copy : bool, optional
            If `copy=True`, the returned dataset contains only copied
            variables. If `copy=False` and no reindexing is required then
            original variables from this dataset are returned.
        **kw_indexers : optional
            Keyword arguments in the same form as ``indexers``.

        Returns
        -------
        reindexed : Dataset
            Another dataset, with this dataset's data but replaced coordinates.

        See Also
        --------
        Dataset.reindex_like
        align
        pandas.Index.get_indexer
        """
        indexers = utils.combine_pos_and_kw_args(indexers, kw_indexers,
                                                 'reindex')
        if not indexers:
            # shortcut
            return self.copy(deep=True) if copy else self

        bad_dims = [d for d in indexers if d not in self.dims]
        if bad_dims:
            raise ValueError('invalid reindex dimensions: %s' % bad_dims)

        variables = alignment.reindex_variables(
            self.variables, self.indexes, indexers, method, tolerance, copy=copy)
        return self._replace_vars_and_dims(variables)

    def rename(self, name_dict, inplace=False):
        """Returns a new object with renamed variables and dimensions.

        Parameters
        ----------
        name_dict : dict-like
            Dictionary whose keys are current variable or dimension names and
            whose values are new names.
        inplace : bool, optional
            If True, rename variables and dimensions in-place. Otherwise,
            return a new dataset object.

        Returns
        -------
        renamed : Dataset
            Dataset with renamed variables and dimensions.

        See Also
        --------

        Dataset.swap_dims
        DataArray.rename
        """
        for k, v in name_dict.items():
            if k not in self:
                raise ValueError("cannot rename %r because it is not a "
                                 "variable in this dataset" % k)
            if v in self:
                raise ValueError('the new name %r already exists' % v)

        variables = OrderedDict()
        coord_names = set()
        for k, v in iteritems(self._variables):
            name = name_dict.get(k, k)
            dims = tuple(name_dict.get(dim, dim) for dim in v.dims)
            var = v.copy(deep=False)
            var.dims = dims
            variables[name] = var
            if k in self._coord_names:
                coord_names.add(name)

        return self._replace_vars_and_dims(variables, coord_names,
                                           inplace=inplace)

    def swap_dims(self, dims_dict, inplace=False):
        """Returns a new object with swapped dimensions.

        Parameters
        ----------
        dims_dict : dict-like
            Dictionary whose keys are current dimension names and whose values
            are new names. Each value must already be a variable in the
            dataset.
        inplace : bool, optional
            If True, swap dimensions in-place. Otherwise, return a new dataset
            object.

        Returns
        -------
        renamed : Dataset
            Dataset with swapped dimensions.

        See Also
        --------

        Dataset.rename
        DataArray.swap_dims
        """
        for k, v in dims_dict.items():
            if k not in self.dims:
                raise ValueError('cannot swap from dimension %r because it is '
                                 'not an existing dimension' % k)
            if self.variables[v].dims != (k,):
                raise ValueError('replacement dimension %r is not a 1D '
                                 'variable along the old dimension %r'
                                 % (v, k))

        result_dims = set(dims_dict.get(dim, dim) for dim in self.dims)

        variables = OrderedDict()

        coord_names = self._coord_names.copy()
        coord_names.update(dims_dict.values())

        for k, v in iteritems(self.variables):
            dims = tuple(dims_dict.get(dim, dim) for dim in v.dims)
            var = v.to_coord() if k in result_dims else v.to_variable()
            var.dims = dims
            variables[k] = var

        return self._replace_vars_and_dims(variables, coord_names,
                                           inplace=inplace)

    def _stack_once(self, dims, new_dim):
        variables = OrderedDict()
        for name, var in self.variables.items():
            if name not in dims:
                if any(d in var.dims for d in dims):
                    add_dims = [d for d in dims if d not in var.dims]
                    vdims = list(var.dims) + add_dims
                    shape = [self.dims[d] for d in vdims]
                    exp_var = var.expand_dims(vdims, shape)
                    stacked_var = exp_var.stack(**{new_dim: dims})
                    variables[name] = stacked_var
                else:
                    variables[name] = var.copy(deep=False)

        idx = pd.MultiIndex.from_product([self.indexes[d] for d in dims],
                                         names=dims)
        variables[new_dim] = Coordinate(new_dim, idx)

        coord_names = set(self._coord_names) - set(dims) | set([new_dim])

        return self._replace_vars_and_dims(variables, coord_names)

    def stack(self, **dimensions):
        """
        Stack any number of existing dimensions into a single new dimension.

        New dimensions will be added at the end, and the corresponding
        coordinate variables will be combined into a MultiIndex.

        Parameters
        ----------
        **dimensions : keyword arguments of the form new_name=(dim1, dim2, ...)
            Names of new dimensions, and the existing dimensions that they
            replace.

        Returns
        -------
        stacked : Dataset
            Dataset with stacked data.

        See also
        --------
        Dataset.unstack
        """
        result = self
        for new_dim, dims in dimensions.items():
            result = result._stack_once(dims, new_dim)
        return result

    def unstack(self, dim):
        """
        Unstack an existing dimension corresponding to a MultiIndex into
        multiple new dimensions.

        New dimensions will be added at the end.

        Parameters
        ----------
        dim : str
            Name of the existing dimension to unstack.

        Returns
        -------
        unstacked : Dataset
            Dataset with unstacked data.

        See also
        --------
        Dataset.stack
        """
        if dim not in self.dims:
            raise ValueError('invalid dimension: %s' % dim)

        index = self.indexes[dim]
        if not isinstance(index, pd.MultiIndex):
            raise ValueError('cannot unstack a dimension that does not have '
                             'a MultiIndex')

        full_idx = pd.MultiIndex.from_product(index.levels, names=index.names)
        obj = self.reindex(copy=False, **{dim: full_idx})

        new_dim_names = index.names
        if any(name is None for name in new_dim_names):
            raise ValueError('cannot unstack dimension with unnamed levels')

        new_dim_sizes = [lev.size for lev in index.levels]

        variables = OrderedDict()
        for name, var in obj.variables.items():
            if name != dim:
                if dim in var.dims:
                    new_dims = OrderedDict(zip(new_dim_names, new_dim_sizes))
                    variables[name] = var.unstack(**{dim: new_dims})
                else:
                    variables[name] = var

        for name, lev in zip(new_dim_names, index.levels):
            variables[name] = Coordinate(name, lev)

        coord_names = set(self._coord_names) - set([dim]) | set(new_dim_names)

        return self._replace_vars_and_dims(variables, coord_names)

    def update(self, other, inplace=True):
        """Update this dataset's variables with those from another dataset.

        Parameters
        ----------
        other : Dataset or castable to Dataset
            Dataset or variables with which to update this dataset.
        inplace : bool, optional
            If True, merge the other dataset into this dataset in-place.
            Otherwise, return a new dataset object.

        Returns
        -------
        updated : Dataset
            Updated dataset.

        Raises
        ------
        ValueError
            If any dimensions would have inconsistent sizes in the updated
            dataset.
        """
        return self.merge(
            other, inplace=inplace, overwrite_vars=list(other), join='left')

    def merge(self, other, inplace=False, overwrite_vars=set(),
              compat='broadcast_equals', join='outer'):
        """Merge the arrays of two datasets into a single dataset.

        This method generally not allow for overriding data, with the exception
        of attributes, which are ignored on the second dataset. Variables with
        the same name are checked for conflicts via the equals or identical
        methods.

        Parameters
        ----------
        other : Dataset or castable to Dataset
            Dataset or variables to merge with this dataset.
        inplace : bool, optional
            If True, merge the other dataset into this dataset in-place.
            Otherwise, return a new dataset object.
        overwrite_vars : str or sequence, optional
            If provided, update variables of these name(s) without checking for
            conflicts in this dataset.
        compat : {'broadcast_equals', 'equals', 'identical'}, optional
            String indicating how to compare variables of the same name for
            potential conflicts:

            - 'broadcast_equals': all values must be equal when variables are
              broadcast against each other to ensure common dimensions.
            - 'equals': all values and dimensions must be the same.
            - 'identical': all values, dimensions and attributes must be the
              same.
        join : {'outer', 'inner', 'left', 'right'}, optional
            Method for joining ``self`` and ``other`` along shared dimensions:

            - 'outer': use the union of the indexes
            - 'inner': use the intersection of the indexes
            - 'left': use indexes from ``self``
            - 'right': use indexes from ``other``

        Returns
        -------
        merged : Dataset
            Merged dataset.

        Raises
        ------
        ValueError
            If any variables conflict (see ``compat``).
        """
        replace_vars, new_coord_names = merge_datasets(
            self, other, overwrite_vars, compat=compat, join=join)
        obj = self if inplace else self.copy()
        obj._update_vars_and_coords(replace_vars, new_coord_names)
        return obj

    def _assert_all_in_dataset(self, names, virtual_okay=False):
        bad_names = set(names) - set(self._variables)
        if virtual_okay:
            bad_names -= self.virtual_variables
        if bad_names:
            raise ValueError('One or more of the specified variables '
                             'cannot be found in this dataset')

    def drop(self, labels, dim=None):
        """Drop variables or index labels from this dataset.

        If a variable corresponding to a dimension is dropped, all variables
        that use that dimension are also dropped.

        Parameters
        ----------
        labels : str
            Names of variables or index labels to drop.
        dim : None or str, optional
            Dimension along which to drop index labels. By default (if
            ``dim is None``), drops variables rather than index labels.

        Returns
        -------
        dropped : Dataset
        """
        if utils.is_scalar(labels):
            labels = [labels]
        if dim is None:
            return self._drop_vars(labels)
        else:
            new_index = self.indexes[dim].drop(labels)
            return self.loc[{dim: new_index}]

    def _drop_vars(self, names):
        self._assert_all_in_dataset(names)
        drop = set(names)
        drop |= set(k for k, v in iteritems(self._variables)
                    if any(name in v.dims for name in names))
        variables = OrderedDict((k, v) for k, v in iteritems(self._variables)
                                if k not in drop)
        coord_names = set(k for k in self._coord_names if k in variables)
        return self._replace_vars_and_dims(variables, coord_names)

    def drop_vars(self, *names):  # pragma: no cover
        warnings.warn('the Dataset method `drop_vars` has been deprecated; '
                      'use `drop` instead',
                      FutureWarning, stacklevel=2)
        return self.drop(names)

    def transpose(self, *dims):
        """Return a new Dataset object with all array dimensions transposed.

        Although the order of dimensions on each array will change, the dataset
        dimensions themselves will remain in fixed (sorted) order.

        Parameters
        ----------
        *dims : str, optional
            By default, reverse the dimensions on each array. Otherwise,
            reorder the dimensions to this order.

        Returns
        -------
        transposed : Dataset
            Each array in the dataset (including) coordinates will be
            transposed to the given order.

        Notes
        -----
        Although this operation returns a view of each array's data, it
        is not lazy -- the data will be fully loaded into memory.

        See Also
        --------
        numpy.transpose
        DataArray.transpose
        """
        if dims:
            if set(dims) ^ set(self.dims):
                raise ValueError('arguments to transpose (%s) must be '
                                 'permuted dataset dimensions (%s)'
                                 % (dims, tuple(self.dims)))
        ds = self.copy()
        for name, var in iteritems(self._variables):
            var_dims = tuple(dim for dim in dims if dim in var.dims)
            ds._variables[name] = var.transpose(*var_dims)
        return ds

    @property
    def T(self):
        return self.transpose()

    def squeeze(self, dim=None):
        """Returns a new dataset with squeezed data.

        Parameters
        ----------
        dim : None or str or tuple of str, optional
            Selects a subset of the length one dimensions. If a dimension is
            selected with length greater than one, an error is raised.  If
            None, all length one dimensions are squeezed.

        Returns
        -------
        squeezed : Dataset
            This dataset, but with with all or a subset of the dimensions of
            length 1 removed.

        Notes
        -----
        Although this operation returns a view of each variable's data, it is
        not lazy -- all variable data will be fully loaded.

        See Also
        --------
        numpy.squeeze
        """
        return common.squeeze(self, self.dims, dim)

    def dropna(self, dim, how='any', thresh=None, subset=None):
        """Returns a new dataset with dropped labels for missing values along
        the provided dimension.

        Parameters
        ----------
        dim : str
            Dimension along which to drop missing values. Dropping along
            multiple dimensions simultaneously is not yet supported.
        how : {'any', 'all'}, optional
            * any : if any NA values are present, drop that label
            * all : if all values are NA, drop that label
        thresh : int, default None
            If supplied, require this many non-NA values.
        subset : sequence, optional
            Subset of variables to check for missing values. By default, all
            variables in the dataset are checked.

        Returns
        -------
        Dataset
        """
        # TODO: consider supporting multiple dimensions? Or not, given that
        # there are some ugly edge cases, e.g., pandas's dropna differs
        # depending on the order of the supplied axes.

        if dim not in self.dims:
            raise ValueError('%s must be a single dataset dimension' % dim)

        if subset is None:
            subset = list(self.data_vars)

        count = np.zeros(self.dims[dim], dtype=np.int64)
        size = 0

        for k in subset:
            array = self._variables[k]
            if dim in array.dims:
                dims = [d for d in array.dims if d != dim]
                count += array.count(dims)
                size += np.prod([self.dims[d] for d in dims])

        if thresh is not None:
            mask = count >= thresh
        elif how == 'any':
            mask = count == size
        elif how == 'all':
            mask = count > 0
        elif how is not None:
            raise ValueError('invalid how option: %s' % how)
        else:
            raise TypeError('must specify how or thresh')

        return self.isel(**{dim: mask})

    def fillna(self, value):
        """Fill missing values in this object.

        This operation follows the normal broadcasting and alignment rules that
        xarray uses for binary arithmetic, except the result is aligned to this
        object (``join='left'``) instead of aligned to the intersection of
        index coordinates (``join='inner'``).

        Parameters
        ----------
        value : scalar, ndarray, DataArray, dict or Dataset
            Used to fill all matching missing values in this dataset's data
            variables. Scalars, ndarrays or DataArrays arguments are used to
            fill all data with aligned coordinates (for DataArrays).
            Dictionaries or datasets match data variables and then align
            coordinates if necessary.

        Returns
        -------
        Dataset
        """
        return self._fillna(value)

    def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False,
               allow_lazy=False, **kwargs):
        """Reduce this dataset by applying `func` along some dimension(s).

        Parameters
        ----------
        func : function
            Function which can be called in the form
            `f(x, axis=axis, **kwargs)` to return the result of reducing an
            np.ndarray over an integer valued axis.
        dim : str or sequence of str, optional
            Dimension(s) over which to apply `func`.  By default `func` is
            applied over all dimensions.
        keep_attrs : bool, optional
            If True, the dataset's attributes (`attrs`) will be copied from
            the original object to the new one.  If False (default), the new
            object will be returned without attributes.
        numeric_only : bool, optional
            If True, only apply ``func`` to variables with a numeric dtype.
        **kwargs : dict
            Additional keyword arguments passed on to ``func``.

        Returns
        -------
        reduced : Dataset
            Dataset with this object's DataArrays replaced with new DataArrays
            of summarized data and the indicated dimension(s) removed.
        """
        if isinstance(dim, basestring):
            dims = set([dim])
        elif dim is None:
            dims = set(self.dims)
        else:
            dims = set(dim)

        _assert_empty([dim for dim in dims if dim not in self.dims],
                      'Dataset does not contain the dimensions: %s')

        variables = OrderedDict()
        for name, var in iteritems(self._variables):
            reduce_dims = [dim for dim in var.dims if dim in dims]
            if reduce_dims or not var.dims:
                if name not in self.coords:
                    if (not numeric_only or
                        np.issubdtype(var.dtype, np.number) or
                            var.dtype == np.bool_):
                        if len(reduce_dims) == 1:
                            # unpack dimensions for the benefit of functions
                            # like np.argmin which can't handle tuple arguments
                            reduce_dims, = reduce_dims
                        elif len(reduce_dims) == var.ndim:
                            # prefer to aggregate over axis=None rather than
                            # axis=(0, 1) if they will be equivalent, because
                            # the former is often more efficient
                            reduce_dims = None
                        variables[name] = var.reduce(func, dim=reduce_dims,
                                                     keep_attrs=keep_attrs,
                                                     allow_lazy=allow_lazy,
                                                     **kwargs)
            else:
                variables[name] = var

        coord_names = set(k for k in self.coords if k in variables)
        attrs = self.attrs if keep_attrs else None
        return self._replace_vars_and_dims(variables, coord_names, attrs)

    def apply(self, func, keep_attrs=False, args=(), **kwargs):
        """Apply a function over the data variables in this dataset.

        Parameters
        ----------
        func : function
            Function which can be called in the form `f(x, **kwargs)` to
            transform each DataArray `x` in this dataset into another
            DataArray.
        keep_attrs : bool, optional
            If True, the dataset's attributes (`attrs`) will be copied from
            the original object to the new one. If False, the new object will
            be returned without attributes.
        args : tuple, optional
            Positional arguments passed on to `func`.
        **kwargs : dict
            Keyword arguments passed on to `func`.

        Returns
        -------
        applied : Dataset
            Resulting dataset from applying ``func`` over each data variable.
        """
        variables = OrderedDict(
            (k, maybe_wrap_array(v, func(v, *args, **kwargs)))
            for k, v in iteritems(self.data_vars))
        attrs = self.attrs if keep_attrs else None
        return type(self)(variables, attrs=attrs)

    def assign(self, **kwargs):
        """Assign new data variables to a Dataset, returning a new object
        with all the original variables in addition to the new ones.

        Parameters
        ----------
        kwargs : keyword, value pairs
            keywords are the variables names. If the values are callable, they
            are computed on the Dataset and assigned to new data variables. If
            the values are not callable, (e.g. a DataArray, scalar, or array),
            they are simply assigned.

        Returns
        -------
        ds : Dataset
            A new Dataset with the new variables in addition to all the
            existing variables.

        Notes
        -----
        Since ``kwargs`` is a dictionary, the order of your arguments may not
        be preserved, and so the order of the new variables is not well
        defined. Assigning multiple variables within the same ``assign`` is
        possible, but you cannot reference other variables created within the
        same ``assign`` call.

        See Also
        --------
        pandas.DataFrame.assign
        """
        data = self.copy()
        # do all calculations first...
        results = data._calc_assign_results(kwargs)
        # ... and then assign
        data.update(results)
        return data

    def to_array(self, dim='variable', name=None):
        """Convert this dataset into an xarray.DataArray

        The data variables of this dataset will be broadcast against each other
        and stacked along the first axis of the new array. All coordinates of
        this dataset will remain coordinates.

        Parameters
        ----------
        dim : str, optional
            Name of the new dimension.
        name : str, optional
            Name of the new data array.

        Returns
        -------
        array : xarray.DataArray
        """
        from .dataarray import DataArray

        data_vars = [self.variables[k] for k in self.data_vars]
        broadcast_vars = broadcast_variables(*data_vars)
        data = ops.stack([b.data for b in broadcast_vars], axis=0)

        coords = dict(self.coords)
        coords[dim] = list(self.data_vars)

        dims = (dim,) + broadcast_vars[0].dims

        return DataArray(data, coords, dims, attrs=self.attrs, name=name)

    def _to_dataframe(self, ordered_dims):
        columns = [k for k in self if k not in self.dims]
        data = [self._variables[k].expand_dims(ordered_dims).values.reshape(-1)
                for k in columns]
        index = self.coords.to_index(ordered_dims)
        return pd.DataFrame(OrderedDict(zip(columns, data)), index=index)

    def to_dataframe(self):
        """Convert this dataset into a pandas.DataFrame.

        Non-index variables in this dataset form the columns of the
        DataFrame. The DataFrame is be indexed by the Cartesian product of
        this dataset's indices.
        """
        return self._to_dataframe(self.dims)

    @classmethod
    def from_dataframe(cls, dataframe):
        """Convert a pandas.DataFrame into an xarray.Dataset

        Each column will be converted into an independent variable in the
        Dataset. If the dataframe's index is a MultiIndex, it will be expanded
        into a tensor product of one-dimensional indices (filling in missing
        values with NaN). This method will produce a Dataset very similar to
        that on which the 'to_dataframe' method was called, except with
        possibly redundant dimensions (since all dataset variables will have
        the same dimensionality).
        """
        # TODO: Add an option to remove dimensions along which the variables
        # are constant, to enable consistent serialization to/from a dataframe,
        # even if some variables have different dimensionality.

        if not dataframe.columns.is_unique:
            raise ValueError(
                'cannot convert DataFrame with non-unique columns')

        idx = dataframe.index
        obj = cls()

        if hasattr(idx, 'levels'):
            # it's a multi-index
            # expand the DataFrame to include the product of all levels
            full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names)
            dataframe = dataframe.reindex(full_idx)
            dims = [name if name is not None else 'level_%i' % n
                    for n, name in enumerate(idx.names)]
            for dim, lev in zip(dims, idx.levels):
                obj[dim] = (dim, lev)
            shape = [lev.size for lev in idx.levels]
        else:
            dims = (idx.name if idx.name is not None else 'index',)
            obj[dims[0]] = (dims, idx)
            shape = -1

        for name, series in iteritems(dataframe):
            data = series.values.reshape(shape)
            obj[name] = (dims, data)
        return obj

    @staticmethod
    def _unary_op(f, keep_attrs=False):
        @functools.wraps(f)
        def func(self, *args, **kwargs):
            ds = self.coords.to_dataset()
            for k in self.data_vars:
                ds._variables[k] = f(self._variables[k], *args, **kwargs)
            if keep_attrs:
                ds._attrs = self._attrs
            return ds
        return func

    @staticmethod
    def _binary_op(f, reflexive=False, join='inner', drop_na_vars=True):
        @functools.wraps(f)
        def func(self, other):
            if isinstance(other, groupby.GroupBy):
                return NotImplemented
            if hasattr(other, 'indexes'):
                self, other = align(self, other, join=join, copy=False)
                empty_indexes = [d for d, s in self.dims.items() if s == 0]
                if empty_indexes:
                    raise ValueError('no overlapping labels for some '
                                     'dimensions: %s' % empty_indexes)
            g = f if not reflexive else lambda x, y: f(y, x)
            ds = self._calculate_binary_op(g, other, drop_na_vars=drop_na_vars)
            return ds
        return func

    @staticmethod
    def _inplace_binary_op(f):
        @functools.wraps(f)
        def func(self, other):
            if isinstance(other, groupby.GroupBy):
                raise TypeError('in-place operations between a Dataset and '
                                'a grouped object are not permitted')
            if hasattr(other, 'indexes'):
                other = other.reindex_like(self, copy=False)
            # we don't want to actually modify arrays in-place
            g = ops.inplace_to_noninplace_op(f)
            ds = self._calculate_binary_op(g, other, inplace=True)
            self._replace_vars_and_dims(ds._variables, ds._coord_names,
                                        ds._attrs, inplace=True)
            return self
        return func

    def _calculate_binary_op(self, f, other, inplace=False, drop_na_vars=True):

        def apply_over_both(lhs_data_vars, rhs_data_vars, lhs_vars, rhs_vars):
            dest_vars = OrderedDict()
            performed_op = False
            for k in lhs_data_vars:
                if k in rhs_data_vars:
                    dest_vars[k] = f(lhs_vars[k], rhs_vars[k])
                    performed_op = True
                elif inplace:
                    raise ValueError(
                        'datasets must have the same data variables '
                        'for in-place arithmetic operations: %s, %s'
                        % (list(lhs_data_vars), list(rhs_data_vars)))
                elif not drop_na_vars:
                    # this shortcuts left alignment of variables for fillna
                    dest_vars[k] = lhs_vars[k]
            if not performed_op:
                raise ValueError(
                    'datasets have no overlapping data variables: %s, %s'
                    % (list(lhs_data_vars), list(rhs_data_vars)))
            return dest_vars

        if utils.is_dict_like(other) and not isinstance(other, Dataset):
            # can't use our shortcut of doing the binary operation with
            # Variable objects, so apply over our data vars instead.
            new_data_vars = apply_over_both(self.data_vars, other,
                                            self.data_vars, other)
            return Dataset(new_data_vars)

        other_coords = getattr(other, 'coords', None)
        ds = self.coords.merge(other_coords)

        if isinstance(other, Dataset):
            new_vars = apply_over_both(self.data_vars, other.data_vars,
                                       self.variables, other.variables)
        else:
            other_variable = getattr(other, 'variable', other)
            new_vars = OrderedDict((k, f(self.variables[k], other_variable))
                                   for k in self.data_vars)

        ds._variables.update(new_vars)
        return ds

    def diff(self, dim, n=1, label='upper'):
        """Calculate the n-th order discrete difference along given axis.

        Parameters
        ----------
        dim : str, optional
            Dimension over which to calculate the finite difference.
        n : int, optional
            The number of times values are differenced.
        label : str, optional
            The new coordinate in dimension ``dim`` will have the
            values of either the minuend's or subtrahend's coordinate
            for values 'upper' and 'lower', respectively.  Other
            values are not supported.

        Returns
        -------
        difference : same type as caller
            The n-th order finite difference of this object.

        Examples
        --------
        >>> ds = xr.Dataset({'foo': ('x', [5, 5, 6, 6])})
        >>> ds.diff('x')
        <xarray.Dataset>
        Dimensions:  (x: 3)
        Coordinates:
          * x        (x) int64 1 2 3
        Data variables:
            foo      (x) int64 0 1 0
        >>> ds.diff('x', 2)
        <xarray.Dataset>
        Dimensions:  (x: 2)
        Coordinates:
        * x        (x) int64 2 3
        Data variables:
        foo      (x) int64 1 -1

        """
        if n == 0:
            return self
        if n < 0:
            raise ValueError('order `n` must be non-negative but got {0}'
                             ''.format(n))

        # prepare slices
        kwargs_start = {dim: slice(None, -1)}
        kwargs_end = {dim: slice(1, None)}

        # prepare new coordinate
        if label == 'upper':
            kwargs_new = kwargs_end
        elif label == 'lower':
            kwargs_new = kwargs_start
        else:
            raise ValueError('The \'label\' argument has to be either '
                             '\'upper\' or \'lower\'')

        variables = OrderedDict()

        for name, var in iteritems(self.variables):
            if dim in var.dims:
                if name in self.data_vars:
                    variables[name] = (var.isel(**kwargs_end) -
                                       var.isel(**kwargs_start))
                else:
                    variables[name] = var.isel(**kwargs_new)
            else:
                variables[name] = var

        difference = self._replace_vars_and_dims(variables)

        if n > 1:
            return difference.diff(dim, n - 1)
        else:
            return difference

    def shift(self, **shifts):
        """Shift this dataset by an offset along one or more dimensions.

        Only data variables are moved; coordinates stay in place. This is
        consistent with the behavior of ``shift`` in pandas.

        Parameters
        ----------
        **shifts : keyword arguments of the form {dim: offset}
            Integer offset to shift along each of the given dimensions.
            Positive offsets shift to the right; negative offsets shift to the
            left.

        Returns
        -------
        shifted : Dataset
            Dataset with the same coordinates and attributes but shifted data
            variables.

        See also
        --------
        roll

        Examples
        --------

        >>> ds = xr.Dataset({'foo': ('x', list('abcde'))})
        >>> ds.shift(x=2)
        <xarray.Dataset>
        Dimensions:  (x: 5)
        Coordinates:
          * x        (x) int64 0 1 2 3 4
        Data variables:
            foo      (x) object nan nan 'a' 'b' 'c'
        """
        invalid = [k for k in shifts if k not in self.dims]
        if invalid:
            raise ValueError("dimensions %r do not exist" % invalid)

        variables = OrderedDict()
        for name, var in iteritems(self.variables):
            if name in self.data_vars:
                var_shifts = dict((k, v) for k, v in shifts.items()
                                  if k in var.dims)
                variables[name] = var.shift(**var_shifts)
            else:
                variables[name] = var

        return self._replace_vars_and_dims(variables)

    def roll(self, **shifts):
        """Roll this dataset by an offset along one or more dimensions.

        Unlike shift, roll rotates all variables, including coordinates. The
        direction of rotation is consistent with :py:func:`numpy.roll`.

        Parameters
        ----------
        **shifts : keyword arguments of the form {dim: offset}
            Integer offset to rotate each of the given dimensions. Positive
            offsets roll to the right; negative offsets roll to the left.

        Returns
        -------
        rolled : Dataset
            Dataset with the same coordinates and attributes but rolled
            variables.

        See also
        --------
        shift

        Examples
        --------

        >>> ds = xr.Dataset({'foo': ('x', list('abcde'))})
        >>> ds.roll(x=2)
        <xarray.Dataset>
        Dimensions:  (x: 5)
        Coordinates:
          * x        (x) int64 3 4 0 1 2
        Data variables:
            foo      (x) object 'd' 'e' 'a' 'b' 'c'
        """
        invalid = [k for k in shifts if k not in self.dims]
        if invalid:
            raise ValueError("dimensions %r do not exist" % invalid)

        variables = OrderedDict()
        for name, var in iteritems(self.variables):
            var_shifts = dict((k, v) for k, v in shifts.items()
                              if k in var.dims)
            variables[name] = var.roll(**var_shifts)

        return self._replace_vars_and_dims(variables)

    @property
    def real(self):
        return self._unary_op(lambda x: x.real, keep_attrs=True)(self)

    @property
    def imag(self):
        return self._unary_op(lambda x: x.imag, keep_attrs=True)(self)


ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False)