# Source code for typhon.math.array

"""Functions operating on arrays."""

# Any commits made to this module between 2015-05-01 and 2017-03-01
# by Gerrit Holl are developed for the EC project “Fidelity and
# Uncertainty in Climate Data Records from Earth Observations (FIDUCEO)”.
# Grant agreement: 638822
#
# All those contributions are dual-licensed under the MIT license for use
# in typhon, and the GNU General Public License version 3.

import numpy as np
import scipy.stats


def localmin(arr):
    """Find local minima for 1-D array

    Given a 1-dimensional numpy.ndarray, return the locations of any local
    minimum as a boolean array.  The first and last item are always
    considered False.

    Arguments:

        arr (numpy.ndarray): 1-D ndarray (or a sequence convertible to
            one) for which to find local minima.  Should have a numeric
            dtype.

    Returns:

        numpy.ndarray with dtype `bool` and the same shape as `arr`.  True
        for any element that is strictly smaller than both neighbouring
        elements.  First and last element are always False, so inputs
        with fewer than three elements yield an all-False result.

    Raises:

        ValueError: if `arr` is not 1-dimensional.
    """

    arr = np.asarray(arr)
    if arr.ndim != 1:
        raise ValueError(
            "localmin expects a 1-D array, got {:d} dimensions".format(arr.ndim)
        )

    # Start from all-False so that the result always matches the input
    # length, also for inputs with fewer than three elements.
    is_min = np.zeros(arr.shape, dtype=bool)
    if arr.size > 2:
        inner = arr[1:-1]
        is_min[1:-1] = (inner < arr[:-2]) & (inner < arr[2:])

    return is_min


def limit_ndarray(M, limits):
    """Select elements from structured ndarray based on value ranges

    This function filters a structured ndarray based on ranges defined for
    zero or more fields.  For each field f with limits (lo, hi), it will
    select only those elements where lo<=X[f]<hi.

    >>> X = np.array([(2, 3), (4, 5), (8, 2), (5, 1)],
    ...              dtype=[("A", "i4"), ("B", "i4")])

    >>> print(limit_ndarray(X, {"A": (2, 5)}))
    [(2, 3) (4, 5)]

    >>> X = np.array([([2, 3], 3), ([4, 6], 5), ([8, 3], 2), ([5, 3], 1)],
    ...              dtype=[("A", "i4", 2), ("B", "i4")])

    >>> print(limit_ndarray(X, {"A": (2, 5, "all")}))
    [([2, 3], 3)]

    Arguments:

        M (numpy.ndarray): 1-D structured ndarray

        limits (dict): Dictionary with limits.  Keys must correspond to
            fields in M.  If this is a scalar field
            (`M.dtype[field].shape==()`), values are tuples (lo, hi).
            If this is a multidimensional field, values are tuples (lo,
            hi, mode), where mode must be either `all` or `any`.
            Values in the range [lo, hi) are retained; for
            multidimensional fields a record is retained if all
            (mode `all`) or at least one (mode `any`) of its values lie
            in that range.

    Returns:

        ndarray subset of M.  The selection uses a boolean mask, so this
        is a copy, not a view.

    Raises:

        ValueError: if mode for a multidimensional field is neither `all`
            nor `any`.
    """

    selection = np.ones(shape=M.shape, dtype="?")

    for (field, val) in limits.items():
        if len(M.dtype[field].shape) == 0:
            (lo, hi) = val
            in_range = (M[field] >= lo) & (M[field] < hi)
        else:
            (lo, hi, mode) = val
            if mode not in ("all", "any"):
                raise ValueError(
                    "mode for field {!r} must be 'all' or 'any', "
                    "got {!r}".format(field, mode)
                )
            # Test both bounds per value *before* reducing.  Reducing the
            # two bounds separately would, for mode 'any', accept a record
            # in which one value passes the lower bound and a different
            # value passes the upper bound.
            in_range = (M[field] >= lo) & (M[field] < hi)
            # Collapse the field's own dimensions, one trailing axis at a
            # time, until one boolean per record remains.
            while in_range.ndim > selection.ndim:
                in_range = getattr(in_range, mode)(-1)
        selection = selection & in_range

    return M[selection]


def parity(v):
    """Vectorised parity-checking.

    For any ndarray with an np.integer dtype, return an equally shaped
    array with the number of set bits for each element.  The bit parity
    of an element is the lowest bit of that count (``parity(v) & 1``):
    an odd count means odd parity, an even count means even parity.

    Arguments:

        v (numpy.ndarray): Array (or array-like) of integer dtype.  It is
            not modified.

    Returns:

        ndarray with uint8 dtype with the number of set bits for each
        value in v
    """

    remaining = np.array(v)  # np.array copies: don't ruin original
    bitcount = np.zeros(dtype=">u1", shape=remaining.shape)
    while remaining.any():
        # Every element that is still nonzero has at least one more set
        # bit; count it ...
        bitcount[remaining != 0] += 1
        # ... and clear the lowest set bit of each element.
        remaining &= remaining - 1
    return bitcount


def mad_outliers(arr, cutoff=10, mad0="raise"):
    """Mask out mad outliers

    Mask out any values that are more than N times the median absolute
    deviation from the median.

    Although I (Gerrit Holl) came up with this myself, it's also
    documented at:

    http://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/

    except that I rolled my own approach for "what if mad==0".

    Note: If all values except one are constant, it is not possible to
    determine whether the remaining one is an outlier or “reasonably
    close” to the rest, without additional hints.  In this case, some
    outliers may go unnoticed.

    Arguments:

        arr (numpy.ndarray): n-D array with numeric dtype

        cutoff (int): Maximum tolerable normalised fractional distance

        mad0 (str): What to do if mad=0.  Can be 'raise', 'ignore', or
            'perc'.  In case of 'raise', a ValueError is raised.  In case
            of 'ignore', no attempt is made to identify outliers and
            nothing is flagged.  In case of 'perc', will search for the
            lowest percentile at which the percentile absolute deviation
            is nonzero, increase the cutoff by the fractional approach
            toward percentile 100, and use that percentile instead.  So
            if the first non-zero is at percentile 75%, it will use the
            75th-percentile-absolute-deviation and increase the cutoff by
            a factor (100 - 50)/(100 - 75).

    Returns:

        ndarray with bool dtype, True for outliers

    Raises:

        ValueError: if mad=0 and `mad0` is 'raise', if mad=0 and `mad0`
            is not one of the recognised values, or if `mad0` is 'perc'
            and no percentile has a nonzero absolute deviation.
    """

    arr = np.asanyarray(arr)  # asanyarray keeps masked arrays masked

    # Constant data cannot contain outliers.  Compare the extremes instead
    # of calling arr.ptp(): that ndarray method was removed in NumPy 2.0.
    if arr.max() == arr.min():
        return np.zeros(shape=arr.shape, dtype="?")

    ad = abs(arr - np.ma.median(arr))
    mad = np.ma.median(ad)

    if mad is np.ma.masked:
        # all are masked already…
        return np.ones(shape=ad.shape, dtype="?")

    if mad != 0:
        return (ad / mad) > cutoff

    # From here on mad == 0: more than half of the values are identical.
    if mad0 == "raise":
        raise ValueError("Cannot filter outliers, MAD=0")

    if mad0 == "ignore":
        # No usable scale to normalise by, so flag nothing.
        return np.zeros(shape=ad.shape, dtype="?")

    if mad0 == "perc":
        # try other percentiles
        perc = np.r_[np.arange(50, 99, 1), np.linspace(99, 100, 100)]
        pad = scipy.stats.scoreatpercentile(ad, perc)
        if (pad == 0).all():  # all constant…?
            raise ValueError("These data are weird!")
        p_i = pad.nonzero()[0][0]
        # If only percentile 100 is nonzero the factor is infinite, so
        # nothing is flagged (see the note in the docstring); suppress
        # the division warning for that case.
        with np.errstate(divide="ignore"):
            cutoff = cutoff * ((100 - 50) / (100 - perc[p_i]))
        return (ad / pad[p_i]) > cutoff

    raise ValueError(
        "Unknown value for mad0: {!r}.  "
        "Expected 'raise', 'ignore', or 'perc'.".format(mad0)
    )


def argclosest(array, value, retvalue=False):
    """Returns the index of the closest value in array.

    Parameters:
        array (ndarray): Input array (or array-like).
        value (float): Value to compare to.
        retvalue (bool): If True, return the index and the closest value.

    Returns:
        int, float:
        Index of closest value, Closest value (if ``retvalue`` is True).
        For multidimensional input the index refers to the flattened
        array.  If several elements are equally close, the first one is
        returned.

    """
    arr = np.asarray(array)
    idx = np.abs(arr - value).argmin()

    # argmin() without an axis yields an index into the flattened array,
    # so the matching value has to be looked up through the flat view.
    return (idx, arr.flat[idx]) if retvalue else idx