Source code for latools.helpers.stat_fns

"""
Functions for calculating statistics and handling uncertainties.

(c) Oscar Branson : https://github.com/oscarbranson
"""

import numpy as np
import uncertainties.unumpy as un
import scipy.interpolate as interp
from scipy.stats import pearsonr

[docs]def nan_pearsonr(x, y):
    xy = np.vstack([x, y])
    xy = xy[:, ~np.any(np.isnan(xy),0)]
    n = len(x)
    if xy.shape[-1] < n // 2:
        return np.nan, np.nan
        
    return pearsonr(xy[0], xy[1])

[docs]def R2calc(meas, model, force_zero=False):
    if force_zero:
        SStot = np.sum(meas**2)
    else:
        SStot = np.sum((meas - np.nanmean(meas))**2)
    SSres = np.sum((meas - model)**2)
    return 1 - (SSres / SStot)


# uncertainties unpackers
[docs]def unpack_uncertainties(uarray):
    """
    Convenience function to unpack nominal values and uncertainties from an
    ``uncertainties.uarray``.

    Returns:
        (nominal_values, std_devs)
    """
    try:
        return un.nominal_values(uarray), un.std_devs(uarray)
    except:
        return uarray, None


[docs]def nominal_values(a):
    try:
        return un.nominal_values(a)
    except:
        return a


[docs]def std_devs(a):
    try:
        return un.std_devs(a)
    except:
        return a

[docs]def gauss_weighted_stats(x, yarray, x_new, fwhm):
    """
    Calculate gaussian weigted moving mean, SD and SE.

    Parameters
    ----------
    x : array-like
        The independent variable
    yarray : (n,m) array
        Where n = x.size, and m is the number of
        dependent variables to smooth.
    x_new : array-like
        The new x-scale to interpolate the data
    fwhm : int
        FWHM of the gaussian kernel.

    Returns
    -------
    (mean, std, se) : tuple
    """
    sigma = fwhm / (2 * np.sqrt(2 * np.log(2)))

    # create empty mask array
    mask = np.zeros((x.size, yarray.shape[1], x_new.size))
    # fill mask
    for i, xni in enumerate(x_new):
        mask[:, :, i] = gauss(x[:, np.newaxis], 1, xni, sigma)
    # normalise mask
    nmask = mask / mask.sum(0)  # sum of each gaussian = 1

    # calculate moving average
    av = (nmask * yarray[:, :, np.newaxis]).sum(0)  # apply mask to data
    # sum along xn axis to get means

    # calculate moving sd
    diff = np.power(av - yarray[:, :, np.newaxis], 2)
    std = np.sqrt((diff * nmask).sum(0))
    # sqrt of weighted average of data-mean

    # calculate moving se
    se = std / np.sqrt(mask.sum(0))
    # max amplitude of weights is 1, so sum of weights scales
    # a fn of how many points are nearby. Use this as 'n' in
    # SE calculation.

    return av, std, se


[docs]def gauss(x, *p):
    """ Gaussian function.

    Parameters
    ----------
    x : array_like
        Independent variable.
    *p : parameters unpacked to A, mu, sigma
        A = amplitude, mu = centre, sigma = width

    Return
    ------
    array_like
        gaussian descriped by *p.
    """
    A, mu, sigma = p
    return A * np.exp(-0.5 * (-mu + x)**2 / sigma**2)


# Statistical Functions
[docs]def stderr(a):
    """
    Calculate the standard error of a.
    """
    return np.nanstd(a) / np.sqrt(sum(np.isfinite(a)))


# Robust Statistics. See:
#   - https://en.wikipedia.org/wiki/Robust_statistics
#   - http://www.cscjp.co.jp/fera/document/ANALYSTVol114Decpgs1693-97_1989.pdf
#   - http://www.rsc.org/images/robust-statistics-technical-brief-6_tcm18-214850.pdf
#   - http://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/h15.htm

[docs]def H15_mean(x):
    """
    Calculate the Huber (H15) Robust mean of x.

    For details, see:
        http://www.cscjp.co.jp/fera/document/ANALYSTVol114Decpgs1693-97_1989.pdf
        http://www.rsc.org/images/robust-statistics-technical-brief-6_tcm18-214850.pdf
    """
    mu = np.nanmean(x)
    sd = np.nanstd(x) * 1.134
    sig = 1.5

    hi = x > mu + sig * sd
    lo = x < mu - sig * sd

    if any(hi | lo):
        x[hi] = mu + sig * sd
        x[lo] = mu - sig * sd
        return H15_mean(x)
    else:
        return mu


[docs]def H15_std(x):
    """
    Calculate the Huber (H15) Robust standard deviation of x.

    For details, see:
        http://www.cscjp.co.jp/fera/document/ANALYSTVol114Decpgs1693-97_1989.pdf
        http://www.rsc.org/images/robust-statistics-technical-brief-6_tcm18-214850.pdf
    """
    mu = np.nanmean(x)
    sd = np.nanstd(x) * 1.134
    sig = 1.5

    hi = x > mu + sig * sd
    lo = x < mu - sig * sd

    if any(hi | lo):
        x[hi] = mu + sig * sd
        x[lo] = mu - sig * sd
        return H15_std(x)
    else:
        return sd


[docs]def H15_se(x):
    """
    Calculate the Huber (H15) Robust standard deviation of x.

    For details, see:
        http://www.cscjp.co.jp/fera/document/ANALYSTVol114Decpgs1693-97_1989.pdf
        http://www.rsc.org/images/robust-statistics-technical-brief-6_tcm18-214850.pdf
    """
    sd = H15_std(x)
    return sd / np.sqrt(sum(np.isfinite(x)))

[docs]def get_total_n_points(d):
    """
    Returns the total number of data points in values of dict.

    Paramters
    ---------
    d : dict
    """
    n = 0
    for di in d.values():
        n += len(di)
    return n

[docs]def get_total_time_span(d):
    """
    Returns total length of analysis.
    """

    tmax = 0
    for di in d.values():
        if di.uTime.max() > tmax:
            tmax = di.uTime.max()
    
    return tmax

[docs]class un_interp1d(object):
    """
    object for handling interpolation of values with uncertainties.
    """

    def __init__(self, x, y, fill_value=np.nan, **kwargs):
        if isinstance(fill_value, tuple):
            nom_fill = tuple([un.nominal_values(v) for v in fill_value])
            std_fill = tuple([un.std_devs(v) for v in fill_value])
        else:
            nom_fill = std_fill = fill_value
        self.nom_interp = interp.interp1d(un.nominal_values(x),
                                          un.nominal_values(y),
                                          fill_value=nom_fill, **kwargs)
        self.std_interp = interp.interp1d(un.nominal_values(x),
                                          un.std_devs(y),
                                          fill_value=std_fill, **kwargs)

[docs]    def new(self, xn):
        yn = self.nom_interp(xn)
        yn_err = self.std_interp(xn)
        return un.uarray(yn, yn_err)

[docs]    def new_nom(self, xn):
        return self.nom_interp(xn)

[docs]    def new_std(self, xn):
        return self.std_interp(xn)

[docs]def stack_keys(ddict, keys, extra=None):
    """
    Combine elements of ddict into an array of shape (len(ddict[key]), len(keys)).

    Useful for preparing data for sklearn.

    Parameters
    ----------
    ddict : dict
        A dict containing arrays or lists to be stacked.
        Must be of equal length.
    keys : list or str
        The keys of dict to stack. Must be present in ddict.
    extra : list (optional)
        A list of additional arrays to stack. Elements of extra
        must be the same length as arrays in ddict.
        Extras are inserted as the first columns of output.
    """
    if isinstance(keys, str):
        d = [ddict[keys]]
    else:
        d = [ddict[k] for k in keys]
    if extra is not None:
        d = extra + d
    return np.vstack(d).T