Source code for mathematical.linear_regression

#!/usr/bin/env python3
#
#  linear_regression.py
"""
Functions for performing linear regression.
"""
#
#  Copyright © 2020 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#  Based on Pyteomics (https://github.com/levitsky/pyteomics)
#  |  Copyright (c) 2011-2015, Anton Goloborodko & Lev Levitsky
#  |  Licensed under the Apache License, Version 2.0 (the "License");
#  |  you may not use this file except in compliance with the License.
#  |  You may obtain a copy of the License at
#  |
#  |    http://www.apache.org/licenses/LICENSE-2.0
#  |
#  |  Unless required by applicable law or agreed to in writing, software
#  |  distributed under the License is distributed on an "AS IS" BASIS,
#  |  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  |  See the License for the specific language governing permissions and
#  |  limitations under the License.
#  |
#  |  See also:
#  |  Goloborodko, A.A.; Levitsky, L.I.; Ivanov, M.V.; and Gorshkov, M.V. (2013)
#  |  "Pyteomics - a Python Framework for Exploratory Data Analysis and Rapid Software
#  |  Prototyping in Proteomics", Journal of The American Society for Mass Spectrometry,
#  |  24(2), 301–304. DOI: `10.1007/s13361-012-0516-6 <http://dx.doi.org/10.1007/s13361-012-0516-6>`_
#  |
#  |  Levitsky, L.I.; Klein, J.; Ivanov, M.V.; and Gorshkov, M.V. (2018)
#  |  "Pyteomics 4.0: five years of development of a Python proteomics framework",
#  |  Journal of Proteome Research.
#  |  DOI: `10.1021/acs.jproteome.8b00717 <http://dx.doi.org/10.1021/acs.jproteome.8b00717>`_
#

# stdlib
from typing import Optional, Sequence, Tuple, Union

# 3rd party
import numpy

__all__ = ["linear_regression_vertical", "linear_regression_perpendicular", "ArrayLike_Float"]

#: Type hint for arguments that take either a sequence of floats or a numpy array.
ArrayLike_Float = Union[Sequence[float], numpy.ndarray]


[docs]def linear_regression_vertical( x: ArrayLike_Float, y: Optional[ArrayLike_Float] = None, a: Optional[float] = None, b: Optional[float] = None, ) -> Tuple[float, float, float, float]: """ Calculate coefficients of a linear regression y = a * x + b. The fit minimizes *vertical* distances between the points and the line. :param x: 1-D array of floats :param y: 1-D array of floats :param a: If specified then the slope coefficient is fixed as this value :param b: If specified then the free term is fixed as this value If `y` is omitted, `x` must be a 2-D array of shape (N, 2). :return: (a, b, r, stderr), where a -- slope coefficient, b -- free term, r -- Pearson correlation coefficient, stderr -- standard deviation. """ x = numpy.array(x, copy=False) y_array: numpy.ndarray if y is not None: y_array = numpy.array(y, copy=False) else: if len(x.shape) != 2 or x.shape[-1] != 2: raise TypeError(f"If `y` is not given, x.shape should be (N, 2), given: {x.shape}") y_array = x[:, 1] x = x[:, 0] if a is not None and b is None: y_array - a * x # type: ignore b = (y_array - a * x).mean() # type: ignore elif a is not None and b is not None: pass else: a, b = numpy.polyfit(x, y_array, 1) r = numpy.corrcoef(x, y_array)[0, 1] stderr = (y_array - a * x - b).std() # type: ignore return a, b, r, stderr # type: ignore # TODO
linear_regression = linear_regression_vertical
[docs]def linear_regression_perpendicular( x: ArrayLike_Float, y: Optional[ArrayLike_Float] = None, ) -> Tuple[float, float, float, float]: """ Calculate coefficients of a linear regression y = a * x + b. The fit minimizes *perpendicular* distances between the points and the line. :param x: 1-D array of floats. :param y: 1-D array of floats. If `y` is omitted, `x` must be a 2-D array of shape (N, 2). :return: (a, b, r, stderr), where a -- slope coefficient, b -- free term, r -- Peason correlation coefficient, stderr -- standard deviation. """ x = numpy.array(x, copy=False) if y is not None: y = numpy.array(y, copy=False) data = numpy.hstack((x.reshape((-1, 1)), y.reshape((-1, 1)))) else: if len(x.shape) != 2 or x.shape[-1] != 2: raise TypeError(f"If `y` is not given, x.shape should be (N, 2), given: {x.shape}") data = x mu = data.mean(axis=0) eigenvectors, eigenvalues, V = numpy.linalg.svd((data - mu).T, full_matrices=False) a = eigenvectors[0][1] / eigenvectors[0][0] xm, ym = data.mean(axis=0) b = ym - a * xm r = numpy.corrcoef(data[:, 0], data[:, 1])[0, 1] stderr = ((data[:, 1] - a * data[:, 0] - b) / numpy.sqrt(a**2 + 1)).std() return a, b, r, stderr