Source code for typhon.math.stats

"""Various statistical functions

"""

# Any commits made to this module between 2015-05-01 and 2017-03-01
# by Gerrit Holl are developed for the EC project “Fidelity and
# Uncertainty in Climate Data Records from Earth Observations (FIDUCEO)”.
# Grant agreement: 638822
# 
# All those contributions are dual-licensed under the MIT license for use
# in typhon, and the GNU General Public License version 3.

import numbers

import numpy

import scipy.stats
import scipy.special


[docs]def bin(x, y, bins):
    """Bin/bucket y according to values of x.

    Returns list of arrays, one element with values for bin.
    Digitising happens with numpy.digitize.

    You can use this for geographical data, but if your x and
    y are in lat/lons the bins will be equirectangular (if you
    want equal-area bins you will have to reproject / change
    coordinates).

    Arguments:

        x (ndarray): Coordinate that y is binned along
        y (ndarray):  Data to be binned.  First dimension must match
            length of x.  All subsequent dimensions are left untouched.
        bins (ndarray): Bins according to which sort data.

    Returns:

        List of arrays, one element per bin.
    """
    if x.size == y.size == 0:
        return [y[()] for b in bins]
    digits = numpy.digitize(x, bins)
    binned = [y[digits == i, ...] for i in range(len(bins))]
    return binned


[docs]def bin_nd(binners, bins, data=None):
    """Bin/bucket data in arbitrary number of dimensions

    For example, one can bin geographical data according to lat/lon
    through:

    >>> binned = bin_nd([lats, lons], [lat_bins, lon_bins])

    The actually binned data are the indices for the arrays lats/lons,
    which hopefully corresponds to indices in your actual data.

    Data that does not fit in any bin, is not binned anywhere.

    You can use this for geographical data, but if your x and
    y are in lat/lons the bins will be equirectangular (if you
    want equal-area bins you will have to reproject / change
    coordinates).

    Note: do NOT pass the 3rd argument, `data`.  This is used purely for
    the implementation using recursion.  Passing anything here explicitly
    is a recipe for disaster.

    Arguments:

        binners (List[ndarray]): Axes that data is binned at.  This is
            akin to the x-coordinate in `:func:bin`.

        bins (List[ndarray]): Edges for the bins according to which bin
            data.

    Returns:
        n-D ndarray of type `object`, with indices describing what bin
        elements belong to.
    """

    if len(bins) != len(binners):
        raise ValueError("Length of bins must equal length of binners. "
                         "Found {} bins, {} binners.".format(
                             len(bins), len(binners)))

    for b in bins:
        if b.ndim != 1:
            raise ValueError("Bin-array must be 1-D. "
                             "Found {}-D array.".format(b.ndim))

    if not all([b.size == binners[0].size for b in binners[1:]]):
        raise ValueError("All binners must have same length.")

    dims = numpy.array([b.size for b in bins])

    nd = len(binners)

    if nd == 0:
        return numpy.array([], dtype=numpy.uint64)

    indices = numpy.arange(binners[0].size, dtype=numpy.uint64)
    if data is None:
        data = indices

    if nd > 1:
        # innerbinned = bin(binners[-1], data, bins[-1])
        innerbinned = bin(binners[-1], indices, bins[-1])
        outerbinned = []
        for (i, ib) in enumerate(innerbinned):
            obinners = [x[ib] for x in binners[:-1]]
            ob = bin_nd(obinners, bins[:-1], data[ib])
            outerbinned.append(ob)

        # go through some effort to make sure v[i, j, ...] is always
        # numpy.uint64, whereas v is numpy.object_
        # do this in steps, see comment in the else:-block below for
        # reasoning
        #
        # We have outerbinned, which has length n_N, and contains ndarrays
        # of size n_1 * n_2 * ... * n_{N-1}.
        #
        # We want V to be n_1 * n_2 * ... * n_N, where N is the number of
        # dimensions we are binning.
        #
        # The following could /probably/ be do with some sophisticated
        # list comprehension and permutation, but this is clearer.

        V = numpy.empty(shape=dims, dtype=numpy.object_)
        for i in range(len(outerbinned)):
            V[..., i] = outerbinned[i]

#        V.T[...] = [
#            [numpy.array(e.tolist(), dtype=numpy.uint64)
#                for e in l] for l in outerbinned]
        return V
        # return numpy.array(v, dtype=numpy.object_)
    else:
        # NB: I should not convert a list-of-ndarrays to an object-ndarray
        # directly.  If all nd-arrays have the same dimensions (such as
        # size x=0), the converted nd-array will have x as an additional
        # dimension, rather than having object arrays inside the
        # container.  To prevent this, explicitly initialise the ndarray.
        binned = bin(binners[0], data, bins[0])
        B = numpy.empty(shape=len(binned), dtype=numpy.object_)
        B[:] = binned
        return B


def binned_statistic(coords, data, bins, statistic=None):
    """Bin data and calculate statistics on them

    As :func:`scipy.stats.binned_statistic` but faster for simple
    statistics.

    Args:
        coords: 1-dimensional numpy.array with the coordinates that should be
            binned.
        data: A numpy.array on which the statistic should be applied.
        bins: The bins used for the binning.
        statistic: So far only *mean* is implemented.

    Returns:
        A numpy array with statistic results.
    """
    bin_indices = numpy.digitize(coords, bins)
    data_sum = numpy.bincount(bin_indices, weights=data, minlength=bins.size)
    data_number = numpy.bincount(bin_indices, minlength=bins.size)
    return data_sum / data_number

[docs]def get_distribution_as_percentiles(x, y,
                                    bins,
                                    ptiles=(5, 25, 5, 75, 95)):
    """get the distribution of y vs. x as percentiles.

    Bin y-data according to x-data (using :func:`typhon.math.stats.bin`).
    Then, within each bin, calculate percentiles.

    Arguments:

        x (ndarray): data for x-axis
        y (ndarray): data for y-axis
        bins (ndarray): Specific bins to use for dividing the x-data.
        ptiles (ndarray): Percentiles to use.
    """

    # explicitly get rid of masked data, because scoreatpercentile is not
    # masked-array aware
    try:
        good = (~x.mask) & (~y.mask)
    except AttributeError: # not a masked-array, leave as-is
        pass
    else: # surely masked arrays
        x = x[good].data
        y = y[good].data
    ybinned = bin(x, y, bins)
    return numpy.vstack([scipy.stats.scoreatpercentile(b, ptiles)
                         for b in ybinned])


[docs]def adev(x, dim=-1):
    r"""Calculate Allan deviation in its simplest form

    Arguments:

        x (ndarray or xarray DataArray): n-dim array for Allan calculation
        dim (int or str): optional, axis to operate along, defaults to
            last.  If you pass a str, x must be a xarray.DataArray and the
            dimension will be a name.

    .. math::
        \sigma = \sqrt{\frac{1}{2(N-1)} \sum_{i=1}^{N-1} (y_{i+1} - y_i)^2}

    Equation source: Jon Mittaz, personal communication, April 2016
    """

    
    if isinstance(dim, numbers.Integral):
        # dimension by number, probably ndarray
        x = x.swapaxes(-1, dim)
        N = x.shape[-1]
        return numpy.sqrt(1/(2*(N-1)) *
                          ((x[..., 1:] - x[..., :-1])**2).sum(-1))
    else:
        # dimension by name, should be xarray.Dataarray
        N = x.sizes[dim]
        return numpy.sqrt((1/(2*(N-1)) * x.diff(dim=dim)**2).sum(dim=dim))

[docs]def corrcoef(mat):
    """Calculate correlation coefficient with p-values

    Calculate correlation coefficients along with p-values.

    Arguments:

        mat (ndarray): 2-D array [p×N] for which the correlation matrix is
            calculated

    Returns:

        (r, p) where r is a p×p matrix with the correlation coefficients,
        obtained with numpy.corrcoef, and p is 

    Attribution:
    
        this code, or an earlier version was posted by user 'jingchao' on
        Stack Overflow at 2014-7-3 at
        http://stackoverflow.com/a/24547964/974555 and is licensed under
        CC BY-SA 3.0.  This notice may not be removed.
    """
    r = numpy.corrcoef(mat)
    rf = r[numpy.triu_indices(r.shape[0], 1)]
    df = mat.shape[1] - 2
    ts = rf * rf * (df / (1 - rf * rf))
    pf = scipy.special.betainc(0.5 * df, 0.5, df / (df + ts))
    p = numpy.zeros(shape=r.shape)
    p[numpy.triu_indices(p.shape[0], 1)] = pf
    p[numpy.tril_indices(p.shape[0], -1)] = pf
    p[numpy.diag_indices(p.shape[0])] = numpy.ones(p.shape[0])
    return (r, p)