#!/usr/bin/env python
#
# stats.py
"""
Functions for calculating statistics.
"""
#
# Copyright © 2014-2020 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
# _contains_nan, median_absolute_deviation, absolute_deviation and
# absolute_deviation_from_median adapted from SciPy
# Copyright (c) 2001-2002 Enthought, Inc. 2003-2019, SciPy Developers
# Available under the BSD License
#
# pooled_sd, g_hedge and g_durlak_bias based on formulae from
# https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
#
# interpret_d based on Sullivan, G. and Feinn, R. (2012). Using Effect Size—or
# Why the P Value Is Not Enough. Journal of Graduate Medical Education,
# 4(3), pp.279-282.
# and https://www.psychometrica.de/effect_size.html#transform
#
# d_cohen based on Cohen, J. (1988). Statistical power analysis for the
# behavioral sciences (2nd Edition). Hillsdale, NJ: Lawrence Erlbaum Associates
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
# stdlib
import warnings
from typing import Callable, Optional, Sequence, Union, cast, overload
# 3rd party
import numpy
from typing_extensions import Literal
# this package
from mathematical import utils
__all__ = [
"mean_none",
"std_none",
"median_none",
"iqr_none",
"percentile_none",
"pooled_sd",
"d_cohen",
"g_hedge",
"g_durlak_bias",
"interpret_d",
"median_absolute_deviation",
"absolute_deviation",
"absolute_deviation_from_median",
"within1min"
]
#: Type hint for allowed values for ``nan_values``.
NaNPolicies = Literal["propagate", "raise", "omit"]
[docs]def mean_none(dataset: Sequence[Union[float, bool, None]]) -> float:
"""
Calculate the mean, excluding NaN, strings, boolean values, and zeros.
:param dataset: list to calculate mean from
:return: mean
"""
dataset = utils.strip_none_bool_string(dataset)
dataset = utils.remove_zero(dataset)
return float(numpy.nanmean(dataset))
[docs]def std_none(dataset: Sequence[Union[float, bool, None]], ddof: int = 1) -> float:
"""
Calculate the standard deviation, excluding NaN, strings, boolean values, and zeros.
:param dataset: list to calculate mean from.
:param ddof: Means Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``,
where ``N`` represents the number of elements.
:return: standard deviation
"""
dataset = utils.strip_none_bool_string(dataset)
dataset = utils.remove_zero(dataset)
# print(dataset)
return float(numpy.nanstd(dataset, ddof=ddof))
[docs]def iqr_none(dataset: Sequence[Union[float, bool, None]]) -> float:
"""
Calculate the interquartile range, excluding NaN, strings, boolean values, and zeros.
:param dataset: A list to calculate iqr from.
:return: The interquartile range.
"""
q1 = percentile_none(dataset, 25)
q3 = percentile_none(dataset, 75)
iq = q3 - q1
return float(iq)
[docs]def percentile_none(dataset: Sequence[Union[float, bool, None]], percentage: float) -> float:
"""
Calculate the given percentile, excluding NaN, strings, boolean values, and zeros.
:param dataset: Sequence to calculate the percentile from.
:param percentage:
:raises: :exc:`ValueError` if ``dataset`` contains fewer than two values
:return: The interquartile range.
"""
dataset = utils.strip_none_bool_string(dataset)
dataset = utils.remove_zero(dataset)
dataset = [x for x in dataset if not numpy.isnan(x)]
if len(dataset) < 2:
raise ValueError("Dataset too small")
return float(numpy.percentile(dataset, percentage))
[docs]def pooled_sd(sample1: Sequence[float], sample2: Sequence[float], weighted: bool = False) -> float:
"""
Returns the pooled standard deviation.
:param sample1: datapoints for first sample
:param sample2: datapoints for second sample
:param weighted: True for weighted pooled SD
.. seealso:: https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
"""
sd1 = numpy.std(sample1)
sd2 = numpy.std(sample2)
n1 = len(sample1)
n2 = len(sample2)
if weighted:
return numpy.sqrt((((n1 - 1) * (sd1**2)) + ((n2 - 1) * (sd2**2))) / (n1 + n2 - 2))
else:
return numpy.sqrt(((sd1**2) + (sd2**2)) / 2)
[docs]def d_cohen(
sample1: Sequence[float],
sample2: Sequence[float],
which: Literal[1, 2] = 1,
tail: Literal[1, 2] = 1,
pooled: bool = False,
) -> float:
"""
Calculates and returns Cohen's effect size index **d**.
.. seealso::
Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Edition).
Hillsdale, NJ: Lawrence Erlbaum Associates
:param sample1: datapoints for first sample
:param sample2: datapoints for second sample
:param which: Use the standard deviation of the first sample (``1``) or the second sample (``2``)
:param tail: The number of tails to consider.
:param pooled: Whether to use the pooled standard deviation.
"""
mean1: float = cast(float, numpy.mean(sample1))
mean2: float = cast(float, numpy.mean(sample2))
stdev: float
if which == 1:
stdev = cast(float, numpy.std(sample1))
else:
stdev = cast(float, numpy.std(sample2))
if pooled:
stdev = pooled_sd(sample1, sample2)
if tail == 2:
return numpy.abs(mean1 - mean2) / stdev
return (mean1 - mean2) / stdev
[docs]def g_hedge(sample1: Sequence[float], sample2: Sequence[float]) -> float:
"""
Calculates and returns Hedge's g-Statistic.
Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
:param sample1: datapoints for first sample
:param sample2: datapoints for second sample
"""
mean1: float = cast(float, numpy.mean(sample1))
mean2: float = cast(float, numpy.mean(sample2))
return (mean1 - mean2) / pooled_sd(sample1, sample2, True)
[docs]def g_durlak_bias(g: float, n: float) -> float:
"""
Application of Durlak's bias correction to the Hedge's g statistic.
n = n1+n2
:param g: Hedge's g-Statistic, calculated using :func:`~.g_hedge`.
:param n: The total number of samples in both datasets.
.. seealso:: https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
"""
durlak = ((n - 3) / (n - 2.25)) * numpy.sqrt((n - 2) / n)
return g * durlak
[docs]def interpret_d(d_or_g: float) -> str:
"""
Interpret Cohen's d or Hedge's g values using Table 1
from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3444174/
:param d_or_g:
:type d_or_g:
""" # noqa: D400
if 0.0 <= d_or_g < 0.2:
return "No Effect"
elif 0.2 <= d_or_g < 0.5:
return "Small Effect"
elif 0.5 <= d_or_g < 0.8:
return "Intermediate Effect"
elif 0.8 <= d_or_g:
return "Large Effect"
else: # d_or_g < 0
return f"{interpret_d(numpy.abs(d_or_g)).split(' ')[0]} Adverse Effect"
def _contains_nan(a, nan_policy: NaNPolicies = "propagate"):
policies = ["propagate", "raise", "omit"]
if nan_policy not in policies:
raise ValueError("nan_policy must be one of {%s}" % ", ".join(f"'{s}'" for s in policies))
try:
# Calling numpy.sum to avoid creating a huge array into memory
# e.g. numpy.isnan(a).any()
with numpy.errstate(invalid="ignore"):
contains_nan = numpy.isnan(numpy.sum(a))
except TypeError:
# This can happen when attempting to sum things which are not
# numbers (e.g. as in the function `mode`). Try an alternative method:
try:
contains_nan = numpy.nan in set(a.ravel())
except TypeError:
# Don't know what to do. Fall back to omitting nan values and
# issue a warning.
contains_nan = False
nan_policy = "omit"
warnings.warn(
"The input array could not be properly checked for nan "
"values. nan values will be ignored.",
RuntimeWarning
)
if contains_nan and nan_policy == "raise":
raise ValueError("The input contains nan values")
return contains_nan, nan_policy
@overload
def median_absolute_deviation(
x,
axis: None,
center: Callable = ...,
scale: float = ...,
nan_policy: NaNPolicies = ...,
) -> float: ...
@overload
def median_absolute_deviation(
x,
axis: int = ...,
center: Callable = ...,
scale: float = ...,
nan_policy: NaNPolicies = ...,
) -> numpy.ndarray: ...
@overload
def absolute_deviation(
x,
axis: None,
center: Callable = ...,
nan_policy: NaNPolicies = ...,
) -> float: ...
@overload
def absolute_deviation(
x,
axis: int = ...,
center: Callable = ...,
nan_policy: NaNPolicies = ...,
) -> numpy.ndarray: ...
[docs]def absolute_deviation(
x,
axis: Optional[int] = 0,
center: Callable = numpy.median,
nan_policy: NaNPolicies = "propagate",
) -> Union[float, numpy.ndarray]:
"""
Compute the absolute deviations from the median of the data along the given axis.
:param x: Input array or object that can be converted to an array.
:type x: array_like
:param axis: Axis along which the range is computed. If None, compute
the MAD over the entire array.
:param center: A function that will return the central value. The default is to use
numpy.median. Any user defined function used will need to have the function
signature ``func(arr, axis)``.
:param nan_policy: Defines how to handle when input contains nan. 'propagate'
returns nan, 'raise' throws an error, 'omit' performs the
calculations ignoring nan values.
:returns: If ``axis=None``, a scalar is returned. If the input contains
integers or floats of smaller precision than :class:`numpy.float64`, then the
output data-type is :class:`numpy.float64`. Otherwise, the output data-type is
the same as that of the input.
:rtype: scalar or ndarray
.. note::
The `center` argument only affects the calculation of the central value
around which the MAD is calculated. That is, passing in ``center=numpy.mean``
will calculate the MAD around the mean - it will not calculate the *mean*
absolute deviation.
"""
x = numpy.asarray(x)
# Consistent with `numpy.var` and `numpy.std`.
if not x.size:
return numpy.nan
contains_nan, nan_policy = _contains_nan(x, nan_policy)
if contains_nan and nan_policy == "propagate":
return numpy.nan
if contains_nan and nan_policy == "omit":
# Way faster than carrying the masks around
arr = numpy.ma.masked_invalid(x).compressed()
else:
arr = x
if axis is None:
med = center(arr)
ad = numpy.abs(arr - med)
else:
med = numpy.apply_over_axes(center, arr, axis)
ad = numpy.abs(arr - med)
return ad
@overload
def absolute_deviation_from_median(
x,
axis: None,
center: Callable = ...,
nan_policy: NaNPolicies = ...,
) -> float: ...
@overload
def absolute_deviation_from_median(
x,
axis: int = ...,
center: Callable = ...,
nan_policy: NaNPolicies = ...,
) -> numpy.ndarray: ...
[docs]def within1min(value1: float, value2: float) -> bool:
"""
Returns whether ``value2`` is within one minute of ``value1``.
:param value1: A time in minutes.
:param value2: Another time in minutes.
"""
if value1 and value2:
return (float(value1) - 1) < (float(value2)) < (float(value1) + 1)
else:
return False