Source code for mathematical.stats

#!/usr/bin/env python
#
#  stats.py
"""
Functions for calculating statistics.
"""
#
#  Copyright © 2014-2020 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  _contains_nan, median_absolute_deviation, absolute_deviation and
#  absolute_deviation_from_median adapted from SciPy
#  Copyright (c) 2001-2002 Enthought, Inc.  2003-2019, SciPy Developers
#  Available under the BSD License
#
#  pooled_sd, g_hedge and g_durlak_bias based on formulae from
#  https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
#
#  interpret_d based on Sullivan, G. and Feinn, R. (2012). Using Effect Size—or
#  		Why the P Value Is Not Enough. Journal of Graduate Medical Education,
#  		4(3), pp.279-282.
# 	and https://www.psychometrica.de/effect_size.html#transform
#
#  d_cohen based on Cohen, J. (1988). Statistical power analysis for the
#  		behavioral sciences (2nd Edition). Hillsdale, NJ: Lawrence Erlbaum Associates
#
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#

# stdlib
import warnings
from typing import Callable, Optional, Sequence, Union, cast, overload

# 3rd party
import numpy
from typing_extensions import Literal

# this package
from mathematical import utils

__all__ = [
		"mean_none",
		"std_none",
		"median_none",
		"iqr_none",
		"percentile_none",
		"pooled_sd",
		"d_cohen",
		"g_hedge",
		"g_durlak_bias",
		"interpret_d",
		"median_absolute_deviation",
		"absolute_deviation",
		"absolute_deviation_from_median",
		"within1min"
		]

#: Type hint for allowed values for ``nan_values``.
NaNPolicies = Literal["propagate", "raise", "omit"]


[docs]def mean_none(dataset: Sequence[Union[float, bool, None]]) -> float:
	"""
	Calculate the mean, excluding NaN, strings, boolean values, and zeros.

	:param dataset: list to calculate mean from

	:return: mean
	"""

	dataset = utils.strip_none_bool_string(dataset)
	dataset = utils.remove_zero(dataset)

	return float(numpy.nanmean(dataset))


[docs]def std_none(dataset: Sequence[Union[float, bool, None]], ddof: int = 1) -> float:
	"""
	Calculate the standard deviation, excluding NaN, strings, boolean values, and zeros.

	:param dataset: list to calculate mean from.
	:param ddof: Means Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``,
		where ``N`` represents the number of elements.

	:return: standard deviation
	"""

	dataset = utils.strip_none_bool_string(dataset)
	dataset = utils.remove_zero(dataset)
	# print(dataset)

	return float(numpy.nanstd(dataset, ddof=ddof))


[docs]def median_none(dataset: Sequence[Union[float, bool, None]]) -> float:
	"""
	Calculate the median, excluding NaN, strings, boolean values, and zeros.

	:param dataset: list to calculate median from

	:return: standard deviation
	"""

	dataset = utils.strip_none_bool_string(dataset)
	dataset = utils.remove_zero(dataset)

	return float(numpy.nanmedian(dataset))


[docs]def iqr_none(dataset: Sequence[Union[float, bool, None]]) -> float:
	"""
	Calculate the interquartile range, excluding NaN, strings, boolean values, and zeros.

	:param dataset: A list to calculate iqr from.

	:return: The interquartile range.
	"""

	q1 = percentile_none(dataset, 25)
	q3 = percentile_none(dataset, 75)
	iq = q3 - q1

	return float(iq)


[docs]def percentile_none(dataset: Sequence[Union[float, bool, None]], percentage: float) -> float:
	"""
	Calculate the given percentile, excluding NaN, strings, boolean values, and zeros.

	:param dataset: Sequence to calculate the percentile from.
	:param percentage:

	:raises: :exc:`ValueError` if ``dataset`` contains fewer than two values

	:return: The interquartile range.
	"""

	dataset = utils.strip_none_bool_string(dataset)
	dataset = utils.remove_zero(dataset)
	dataset = [x for x in dataset if not numpy.isnan(x)]

	if len(dataset) < 2:
		raise ValueError("Dataset too small")

	return float(numpy.percentile(dataset, percentage))


[docs]def pooled_sd(sample1: Sequence[float], sample2: Sequence[float], weighted: bool = False) -> float:
	"""
	Returns the pooled standard deviation.

	:param sample1: datapoints for first sample
	:param sample2: datapoints for second sample
	:param weighted: True for weighted pooled SD

	.. seealso:: https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
	"""

	sd1 = numpy.std(sample1)
	sd2 = numpy.std(sample2)
	n1 = len(sample1)
	n2 = len(sample2)

	if weighted:
		return numpy.sqrt((((n1 - 1) * (sd1**2)) + ((n2 - 1) * (sd2**2))) / (n1 + n2 - 2))
	else:
		return numpy.sqrt(((sd1**2) + (sd2**2)) / 2)


[docs]def d_cohen(
		sample1: Sequence[float],
		sample2: Sequence[float],
		which: Literal[1, 2] = 1,
		tail: Literal[1, 2] = 1,
		pooled: bool = False,
		) -> float:
	"""
	Calculates and returns Cohen's effect size index **d**.

	.. seealso::

		Cohen, J. (1988). Statistical power analysis for the behavioral sciences (2nd Edition).
		Hillsdale, NJ: Lawrence Erlbaum Associates

	:param sample1: datapoints for first sample
	:param sample2: datapoints for second sample
	:param which: Use the standard deviation of the first sample (``1``) or the second sample (``2``)
	:param tail: The number of tails to consider.
	:param pooled: Whether to use the pooled standard deviation.
	"""

	mean1: float = cast(float, numpy.mean(sample1))
	mean2: float = cast(float, numpy.mean(sample2))
	stdev: float

	if which == 1:
		stdev = cast(float, numpy.std(sample1))
	else:
		stdev = cast(float, numpy.std(sample2))

	if pooled:
		stdev = pooled_sd(sample1, sample2)

	if tail == 2:
		return numpy.abs(mean1 - mean2) / stdev

	return (mean1 - mean2) / stdev


[docs]def g_hedge(sample1: Sequence[float], sample2: Sequence[float]) -> float:
	"""
	Calculates and returns Hedge's g-Statistic.

	Formula from https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm

	:param sample1: datapoints for first sample
	:param sample2: datapoints for second sample
	"""

	mean1: float = cast(float, numpy.mean(sample1))
	mean2: float = cast(float, numpy.mean(sample2))
	return (mean1 - mean2) / pooled_sd(sample1, sample2, True)


[docs]def g_durlak_bias(g: float, n: float) -> float:
	"""
	Application of Durlak's bias correction to the Hedge's g statistic.

	n = n1+n2

	:param g: Hedge's g-Statistic, calculated using :func:`~.g_hedge`.
	:param n: The total number of samples in both datasets.

	.. seealso:: https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
	"""

	durlak = ((n - 3) / (n - 2.25)) * numpy.sqrt((n - 2) / n)
	return g * durlak


[docs]def interpret_d(d_or_g: float) -> str:
	"""
	Interpret Cohen's d or Hedge's g values using Table 1
	from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3444174/

	:param d_or_g:
	:type d_or_g:
	"""  # noqa: D400

	if 0.0 <= d_or_g < 0.2:
		return "No Effect"
	elif 0.2 <= d_or_g < 0.5:
		return "Small Effect"
	elif 0.5 <= d_or_g < 0.8:
		return "Intermediate Effect"
	elif 0.8 <= d_or_g:
		return "Large Effect"
	else:  # d_or_g < 0
		return f"{interpret_d(numpy.abs(d_or_g)).split(' ')[0]} Adverse Effect"


def _contains_nan(a, nan_policy: NaNPolicies = "propagate"):
	policies = ["propagate", "raise", "omit"]
	if nan_policy not in policies:
		raise ValueError("nan_policy must be one of {%s}" % ", ".join(f"'{s}'" for s in policies))
	try:
		# Calling numpy.sum to avoid creating a huge array into memory
		# e.g. numpy.isnan(a).any()
		with numpy.errstate(invalid="ignore"):
			contains_nan = numpy.isnan(numpy.sum(a))
	except TypeError:
		# This can happen when attempting to sum things which are not
		# numbers (e.g. as in the function `mode`). Try an alternative method:
		try:
			contains_nan = numpy.nan in set(a.ravel())
		except TypeError:
			# Don't know what to do. Fall back to omitting nan values and
			# issue a warning.
			contains_nan = False
			nan_policy = "omit"
			warnings.warn(
					"The input array could not be properly checked for nan "
					"values. nan values will be ignored.",
					RuntimeWarning
					)

	if contains_nan and nan_policy == "raise":
		raise ValueError("The input contains nan values")

	return contains_nan, nan_policy


@overload
def median_absolute_deviation(
		x,
		axis: None,
		center: Callable = ...,
		scale: float = ...,
		nan_policy: NaNPolicies = ...,
		) -> float: ...


@overload
def median_absolute_deviation(
		x,
		axis: int = ...,
		center: Callable = ...,
		scale: float = ...,
		nan_policy: NaNPolicies = ...,
		) -> numpy.ndarray: ...


[docs]def median_absolute_deviation(
		x,
		axis: Optional[int] = 0,
		center: Callable = numpy.median,
		scale: float = 1.4826,
		nan_policy: NaNPolicies = "propagate"
		) -> Union[float, numpy.ndarray]:
	"""
	Compute the median absolute deviation of the data along the given axis.
	The median absolute deviation (MAD, [1]_) computes the median over the
	absolute deviations from the median. It is a measure of dispersion
	similar to the standard deviation, but is more robust to outliers [2]_.
	The MAD of an empty array is ``numpy.nan``.

	:param x: Input array or object that can be converted to an array.
	:type x: array_like
	:param axis: Axis along which the range is computed. If None, compute
		the MAD over the entire array.
	:param center: A function that will return the central value. The default is to use
		numpy.median. Any user defined function used will need to have the function
		signature ``func(arr, axis)``.
	:param scale: The scaling factor applied to the MAD. The default scale (1.4826)
		ensures consistency with the standard deviation for normally distributed
		data.
	:param nan_policy: Defines how to handle when input contains nan. 'propagate'
		returns nan, 'raise' throws an error, 'omit' performs the
		calculations ignoring nan values.

	:returns: If ``axis=None``, a scalar is returned. If the input contains
		integers or floats of smaller precision than :class:`numpy.float64`, then the
		output data-type is :class:`numpy.float64`. Otherwise, the output data-type is
		the same as that of the input.
	:rtype: scalar or ndarray

	.. note::

		The `center` argument only affects the calculation of the central value
		around which the MAD is calculated. That is, passing in ``center=numpy.mean``
		will calculate the MAD around the mean - it will not calculate the *mean*
		absolute deviation.

	**References**

	.. [1] "Median absolute deviation" https://en.wikipedia.org/wiki/Median_absolute_deviation
	.. [2] "Robust measures of scale" https://en.wikipedia.org/wiki/Robust_measures_of_scale

	**Examples**

	When comparing the behavior of `median_absolute_deviation` with ``numpy.std``,
	the latter is affected when we change a single value of an array to have an
	outlier value while the MAD hardly changes:

	.. code-block:: python

		>>> import scipy.stats
		>>> import mathematical.stats
		>>> x = scipy.stats.norm.rvs(size=100, scale=1, random_state=123456)
		>>> x.std()
		0.9973906394005013
		>>> mathematical.stats.median_absolute_deviation(x)
		1.2280762773108278
		>>> x[0] = 345.6
		>>> x.std()
		34.42304872314415
		>>> mathematical.stats.median_absolute_deviation(x)
		1.2340335571164334
		Axis handling example:
		>>> x = numpy.array([[10, 7, 4], [3, 2, 1]])
		>>> x
		array([[10,  7,  4], [ 3,  2,  1],])
		>>> mathematical.stats.median_absolute_deviation(x)
		array([5.1891, 3.7065, 2.2239])
		>>> mathematical.stats.median_absolute_deviation(x, axis=None)
		2.9652
	"""

	ad = absolute_deviation(x, axis=axis, center=center, nan_policy=nan_policy)

	if axis is None:
		mad = numpy.median(ad)
	else:
		mad = numpy.median(ad, axis=axis)

	return scale * mad


@overload
def absolute_deviation(
		x,
		axis: None,
		center: Callable = ...,
		nan_policy: NaNPolicies = ...,
		) -> float: ...


@overload
def absolute_deviation(
		x,
		axis: int = ...,
		center: Callable = ...,
		nan_policy: NaNPolicies = ...,
		) -> numpy.ndarray: ...


[docs]def absolute_deviation(
		x,
		axis: Optional[int] = 0,
		center: Callable = numpy.median,
		nan_policy: NaNPolicies = "propagate",
		) -> Union[float, numpy.ndarray]:
	"""
	Compute the absolute deviations from the median of the data along the given axis.

	:param x: Input array or object that can be converted to an array.
	:type x: array_like
	:param axis: Axis along which the range is computed. If None, compute
		the MAD over the entire array.
	:param center: A function that will return the central value. The default is to use
		numpy.median. Any user defined function used will need to have the function
		signature ``func(arr, axis)``.
	:param nan_policy: Defines how to handle when input contains nan. 'propagate'
		returns nan, 'raise' throws an error, 'omit' performs the
		calculations ignoring nan values.

	:returns: If ``axis=None``, a scalar is returned. If the input contains
		integers or floats of smaller precision than :class:`numpy.float64`, then the
		output data-type is :class:`numpy.float64`. Otherwise, the output data-type is
		the same as that of the input.
	:rtype: scalar or ndarray

	.. note::

		The `center` argument only affects the calculation of the central value
		around which the MAD is calculated. That is, passing in ``center=numpy.mean``
		will calculate the MAD around the mean - it will not calculate the *mean*
		absolute deviation.
	"""

	x = numpy.asarray(x)

	# Consistent with `numpy.var` and `numpy.std`.
	if not x.size:
		return numpy.nan

	contains_nan, nan_policy = _contains_nan(x, nan_policy)

	if contains_nan and nan_policy == "propagate":
		return numpy.nan

	if contains_nan and nan_policy == "omit":
		# Way faster than carrying the masks around
		arr = numpy.ma.masked_invalid(x).compressed()
	else:
		arr = x

	if axis is None:
		med = center(arr)
		ad = numpy.abs(arr - med)
	else:
		med = numpy.apply_over_axes(center, arr, axis)
		ad = numpy.abs(arr - med)

	return ad


@overload
def absolute_deviation_from_median(
		x,
		axis: None,
		center: Callable = ...,
		nan_policy: NaNPolicies = ...,
		) -> float: ...


@overload
def absolute_deviation_from_median(
		x,
		axis: int = ...,
		center: Callable = ...,
		nan_policy: NaNPolicies = ...,
		) -> numpy.ndarray: ...


[docs]def absolute_deviation_from_median(
		x,
		axis: Optional[int] = 0,
		center: Callable = numpy.median,
		nan_policy: NaNPolicies = "propagate",
		) -> Union[float, numpy.ndarray]:
	"""
	Compute the absolute deviation from the median of each point in the data
	along the given axis, given in terms of the MAD.

	.. seealso::

		https://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/

	:param x: Input array or object that can be converted to an array.
	:type x: array_like
	:param axis: Axis along which the range is computed. If None, compute
		the MAD over the entire array.
	:param center: A function that will return the central value. The default is to use
		numpy.median. Any user defined function used will need to have the function
		signature ``func(arr, axis)``.
	:param nan_policy: Defines how to handle when input contains nan. 'propagate'
		returns nan, 'raise' throws an error, 'omit' performs the
		calculations ignoring nan values.

	:returns: If ``axis=None``, a scalar is returned. If the input contains
		integers or floats of smaller precision than :class:`numpy.float64`, then the
		output data-type is :class:`numpy.float64`. Otherwise, the output data-type is
		the same as that of the input.
	:rtype: scalar or ndarray

	.. note::

		The `center` argument only affects the calculation of the central value
		around which the MAD is calculated. That is, passing in ``center=numpy.mean``
		will calculate the MAD around the mean - it will not calculate the *mean*
		absolute deviation.
	"""  # noqa: D400

	ad = absolute_deviation(x, axis=axis, center=center, nan_policy=nan_policy)

	if axis is None:
		mad = numpy.median(ad)
	else:
		mad = numpy.median(ad, axis=axis)

	return ad / mad


[docs]def within1min(value1: float, value2: float) -> bool:
	"""
	Returns whether ``value2`` is within one minute of ``value1``.

	:param value1: A time in minutes.
	:param value2: Another time in minutes.
	"""

	if value1 and value2:
		return (float(value1) - 1) < (float(value2)) < (float(value1) + 1)
	else:
		return False