Source code for mathematical.data_frames

#!/usr/bin/env python
#
#  data_frames.py
"""
Mathematical operations for :class:`Data Frames <pandas.DataFrame>`.
"""
#
#  Copyright © 2019-2020 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  Based on
# 		http://jonathansoma.com/lede/foundations/classes/pandas%20columns%20and%20functions/apply-a-function-to-every-row-in-a-pandas-dataframe/
# 		Copyright 2016 Jonathan Soma
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#

# stdlib
from math import log, log10
from typing import List, Optional, Sequence

# 3rd party
import numpy
import pandas  # type: ignore

# this package
from mathematical import outliers

# Report pandas' two main classes as belonging to the top-level ``pandas`` module.
# NOTE(review): presumably this is so documentation cross-references resolve to
# ``pandas.DataFrame`` / ``pandas.Series`` rather than an internal module path -- confirm.
pandas.DataFrame.__module__ = "pandas"
pandas.Series.__module__ = "pandas"

# Names exported by ``from mathematical.data_frames import *``.
# The outlier-mode constants defined below (MAD, QUARTILES, STDEV2) are not listed here.
__all__ = [
		"df_mean",
		"df_median",
		"df_stdev",
		"df_log_stdev",
		"df_percentage",
		"df_log",
		"df_data_points",
		"df_outliers",
		"df_count",
		"df_delta",
		"df_delta_relative",
		"ColumnLabelList",
		"set_display_options",
		]

# Outlier Modes
# Integer codes accepted by the ``outlier_mode`` parameter of ``df_outliers()``.
MAD = 1  # Median Absolute Deviation
QUARTILES = 2  # distance from the upper / lower quartile
STDEV2 = 3  # more than two standard deviations away from the mean

#: Type hint for the ``column_label_list`` parameter in the ``df_*()`` functions.
ColumnLabelList = Optional[Sequence[str]]


def df_mean(row: pandas.Series, column_label_list: ColumnLabelList = None) -> float:
	"""
	Return the mean of the selected columns in one row of a :class:`data frame <pandas.DataFrame>`.

	This function is meant to be handed to :meth:`df.apply() <pandas.DataFrame.apply>`
	with ``axis=1`` rather than called directly:

	.. code-block:: python

		data_frame["Mean"] = data_frame.apply(
				func=df_mean,
				args=[["Bob", "Alice"]],
				axis=1,
				)

	:param row: Row of the data frame.
	:param column_label_list: Labels of the columns to average.
		When :py:obj:`None`, every column in the row is used.

	:return: The mean of the selected values, ignoring NaN entries.
	"""

	labels = list(row.index) if column_label_list is None else column_label_list
	selected = tuple(row[labels])
	return float(numpy.nanmean(selected))
def df_median(row: pandas.Series, column_label_list: ColumnLabelList = None) -> float:
	"""
	Return the median of the selected columns in one row of a :class:`data frame <pandas.DataFrame>`.

	This function is meant to be handed to :meth:`df.apply() <pandas.DataFrame.apply>`
	with ``axis=1`` rather than called directly:

	.. code-block:: python

		data_frame["Median"] = data_frame.apply(
				func=df_median,
				args=[["Bob", "Alice"]],
				axis=1,
				)

	:param row: Row of the data frame.
	:param column_label_list: Labels of the columns to take the median of.
		When :py:obj:`None`, every column in the row is used.

	:return: The median of the selected values, ignoring NaN entries.
	"""

	labels = list(row.index) if column_label_list is None else column_label_list
	selected = tuple(row[labels])
	return float(numpy.nanmedian(selected))
def df_stdev(row: pandas.Series, column_label_list: ColumnLabelList = None) -> float:
	"""
	Return the standard deviation of the selected columns in one row of a :class:`data frame <pandas.DataFrame>`.

	This function is meant to be handed to :meth:`df.apply() <pandas.DataFrame.apply>`
	with ``axis=1`` rather than called directly:

	.. code-block:: python

		data_frame["Stdev"] = data_frame.apply(
				func=df_stdev,
				args=[["Bob", "Alice"]],
				axis=1,
				)

	:param row: Row of the data frame.
	:param column_label_list: Labels of the columns to compute the standard deviation of.
		When :py:obj:`None`, every column in the row is used.

	:return: The standard deviation of the selected values
		(as computed by :func:`numpy.nanstd`, so NaN entries are ignored).
	"""

	labels = list(row.index) if column_label_list is None else column_label_list
	selected = tuple(row[labels])
	return float(numpy.nanstd(selected))
def df_log_stdev(row: pandas.Series, column_label_list: ColumnLabelList = None) -> float:
	"""
	Return the standard deviation of the log10 of the selected columns in one row of a :class:`data frame <pandas.DataFrame>`.

	This function is meant to be handed to :meth:`df.apply() <pandas.DataFrame.apply>`
	with ``axis=1`` rather than called directly:

	.. code-block:: python

		data_frame["Log Stdev"] = data_frame.apply(
				func=df_log_stdev,
				args=[["Bob", "Alice"]],
				axis=1,
				)

	:param row: Row of the data frame.
	:param column_label_list: Labels of the columns to compute the standard deviation for.
		When :py:obj:`None`, every column in the row is used.

	:return: The standard deviation of the log10 values.
		Values that are not greater than zero are treated as NaN and ignored.
	"""

	labels = list(row.index) if column_label_list is None else column_label_list

	log_values = []
	for value in row[labels]:
		# Anything that is not strictly positive (including NaN) has no usable log10;
		# substitute NaN so that nanstd skips it.
		log_values.append(log10(value) if value > 0.0 else numpy.nan)

	return float(numpy.nanstd(log_values))
def df_percentage(row: pandas.Series, column_label: str, total: float) -> float:
	"""
	Returns the value of the specified column as a percentage of the given total.

	The total is usually the sum of the specified column.

	Do not call this function directly; use it with
	:meth:`df.apply() <pandas.DataFrame.apply>` instead.
	The extra arguments are passed in signature order -- column label first, then the total:

	.. code-block:: python

		data_frame["Bob Percentage"] = data_frame.apply(
				func=df_percentage,
				args=["Bob", 13],
				axis=1,
				)

	:param row: Row of the data frame.
	:param column_label: The column to calculate percentage for.
	:param total: The total value. It is converted with :class:`float` before dividing.

	:return: The column's value divided by ``total``, multiplied by 100.
	"""

	return (row[column_label] / float(total)) * 100.0
def df_log(row: pandas.Series, column_label_list: Sequence[str], base: float = 10) -> float:
	"""
	Calculate the logarithm of the value in the specified column of a row of a :class:`data frame <pandas.DataFrame>`.

	Do not call this function directly; use it with
	:meth:`df.apply() <pandas.DataFrame.apply>` instead:

	.. code-block:: python

		data_frame["Bob Log10"] = data_frame.apply(
				func=df_log,
				args=[["Bob"], 10],
				axis=1,
				)

	:param row: Row of the data frame.
	:param column_label_list: The column label to calculate the log for, wrapped in a list.
		A single value can be converted to a logarithm; if several labels are given
		the result is only defined when at least one value is not positive (``0.0``).
	:param base: The logarithmic base.

	:return: The logarithm of the value, or ``0.0`` if any selected value is not greater than zero.

	:raises TypeError: If the selection does not contain exactly one value
		and every selected value is positive.
	"""

	# Extract the selection as a flat array so that elements are read by position.
	# Indexing a label-indexed Series with integers, and passing a Series straight
	# to math.log, both rely on behaviour that newer pandas versions deprecate.
	values = numpy.atleast_1d(numpy.asarray(row[column_label_list]))

	if not all(value > 0.0 for value in values):
		# Zero, negative and NaN values have no usable logarithm.
		return 0.0

	if len(values) != 1:
		raise TypeError(
				f"df_log() can only convert a single value to a logarithm, "
				f"but {len(values)} values were selected."
				)

	return log(values[0], base)
def df_data_points(row: pandas.Series, column_label_list: Sequence[str]) -> List:
	"""
	Collect the values of the specified columns in one row into a list.

	This function is meant to be handed to :meth:`df.apply() <pandas.DataFrame.apply>`
	with ``axis=1`` rather than called directly:

	.. code-block:: python

		data_frame["Data Points"] = data_frame.apply(
				func=df_data_points,
				args=[["Bob", "Alice"]],
				axis=1,
				)

	:param row: Row of the data frame.
	:param column_label_list: Labels of the columns whose values should be collected.

	:return: The values of the given columns, in the order the labels were supplied.
	"""

	data_points = []

	for label in column_label_list:
		data_points.append(row[label])

	return data_points
def df_outliers(
		row: pandas.Series,
		column_label_list: ColumnLabelList = None,
		outlier_mode: int = MAD,
		) -> List:
	"""
	Identify outliers in each row.

	Only the outliers themselves (if any) are returned.
	For the list of values *without* the outliers, see the functions in :mod:`mathematical.outliers`.

	This function is meant to be handed to :meth:`df.apply() <pandas.DataFrame.apply>`
	with ``axis=1`` rather than called directly:

	.. code-block:: python

		data_frame["Outliers"] = data_frame.apply(
				func=df_outliers,
				args=[["Bob", "Alice"]],
				axis=1,
				)

	:param row: Row of the data frame.
	:param column_label_list: Labels of the columns to determine outliers for.
		When :py:obj:`None`, every column in the row is used.
	:param outlier_mode: The outlier detection method to use.

	The supported outlier modes are:

	* ``1`` or :py:data:`mathematical.data_frames.MAD` -- Use the Median Absolute Deviation.
	* ``2`` or :py:data:`mathematical.data_frames.QUARTILES` -- Treat values more than ``3×``
	  the inter-quartile range away from the upper or lower quartile as outliers.
	* ``3`` or :py:data:`mathematical.data_frames.STDEV2` -- Treat values more than
	  two standard deviations away from the mean as outliers.

	:return: The outliers. An empty list is returned when every selected value is zero.

	:raises ValueError: If ``outlier_mode`` is not one of the supported modes
		(and the selected values are not all zero).
	"""

	labels = list(row.index) if column_label_list is None else column_label_list
	data = row[labels]

	# Nothing can be an outlier when every value is zero; this check runs
	# before the mode is inspected.
	if all(value == 0.0 for value in data):
		return []

	# Each detection function returns the outliers as the first element of its result.
	if outlier_mode == MAD:
		return list(outliers.mad_outliers(data)[0])

	if outlier_mode == QUARTILES:
		return list(outliers.quartile_outliers(data)[0])

	if outlier_mode == STDEV2:
		# outlier classed as more than 2 stdev away from mean
		return list(outliers.stdev_outlier(data, rng=2)[0])

	raise ValueError("Unknown outlier mode.")
def df_count(row: pandas.Series, column_label_list: ColumnLabelList = None) -> int:
	"""
	Count the values in the specified columns of one row of a :class:`data frame <pandas.DataFrame>` that are truthy and not NaN.

	This function is meant to be handed to :meth:`df.apply() <pandas.DataFrame.apply>`
	with ``axis=1`` rather than called directly:

	.. code-block:: python

		data_frame["Count"] = data_frame.apply(
				func=df_count,
				args=[["Bob", "Alice"]],
				axis=1,
				)

	:param row: Row of the data frame.
	:param column_label_list: Labels of the columns to count occurrences in.
		When :py:obj:`None`, every column in the row is used.

	:return: The number of selected values that are neither falsy (e.g. ``0``) nor NaN.
	"""

	labels = list(row.index) if column_label_list is None else column_label_list

	# NOTE(review): the truthiness test means zeros are skipped as well as NaN;
	# confirm whether zero is meant to count as "no value" here.
	return sum(
			1
			for label in labels
			if row[label] and not numpy.isnan(row[label])
			)
def df_delta(row: pandas.Series, left_column: str, right_column: str) -> float:
	"""
	Return the difference between the values of two columns in one row of a :class:`data frame <pandas.DataFrame>`.

	This function is meant to be handed to :meth:`df.apply() <pandas.DataFrame.apply>`
	with ``axis=1`` rather than called directly:

	.. code-block:: python

		data_frame["Delta"] = data_frame.apply(
				func=df_delta,
				args=["Bob", "Alice"],
				axis=1,
				)

	:param row: Row of the data frame.
	:param left_column: Label of the column to subtract from.
	:param right_column: Label of the column whose value is subtracted.

	:return: The difference between ``left_column`` and ``right_column``.

	.. versionadded:: 0.4.0
	"""

	left = row[left_column]
	right = row[right_column]
	return left - right
def df_delta_relative(row: pandas.Series, left_column: str, right_column: str) -> float:
	"""
	Return the relative difference between the values of two columns in one row of a :class:`data frame <pandas.DataFrame>`::

		(left - right) / right

	This function is meant to be handed to :meth:`df.apply() <pandas.DataFrame.apply>`
	with ``axis=1`` rather than called directly:

	.. code-block:: python

		data_frame["Rel. Delta"] = data_frame.apply(
				func=df_delta_relative,
				args=["Bob", "Alice"],
				axis=1,
				)

	:param row: Row of the data frame.
	:param left_column: Label of the column holding the ``left`` value.
	:param right_column: Label of the column holding the ``right`` value (the divisor).

	:return: The relative difference between ``left_column`` and ``right_column``,
		or positive infinity when the value in ``right_column`` is falsy (e.g. zero).

	.. versionadded:: 0.4.0
	"""

	divisor = row[right_column]

	if not divisor:
		# A zero divisor is reported as infinity rather than dividing.
		return float("inf")

	return (row[left_column] - divisor) / divisor
def set_display_options(desired_width: int = 300, max_columns: int = 15, max_rows: int = 20):
	"""
	Configure how numpy and pandas print their objects.

	:param desired_width: The desired maximum output width, in characters.
		Applied to both pandas (``display.width``) and numpy (``linewidth``).
	:param max_columns: The maximum number of columns to display in a :class:`pandas.DataFrame`.
	:param max_rows: The maximum number of rows to display in a :class:`pandas.DataFrame`.

	.. versionadded:: 0.3.0
	"""

	# numpy first, then the three pandas display options; the settings are independent.
	numpy.set_printoptions(linewidth=desired_width)

	pandas.set_option("display.width", desired_width)
	pandas.set_option("display.max_columns", max_columns)
	pandas.set_option("display.max_rows", max_rows)