update

2024-10-07 10:13:40 +07:00
parent aa1631742f
commit 3a7d696db6
9729 changed files with 1832837 additions and 161742 deletions
--- a/.CondaPkg/env/Lib/site-packages/scipy/stats/_odds_ratio.py
+++ b/.CondaPkg/env/Lib/site-packages/scipy/stats/_odds_ratio.py
@@ -0,0 +1,482 @@
+import numpy as np
+
+from scipy.special import ndtri
+from scipy.optimize import brentq
+from ._discrete_distns import nchypergeom_fisher
+from ._common import ConfidenceInterval
+
+
+def _sample_odds_ratio(table):
+    """
+    Given a table [[a, b], [c, d]], compute a*d/(b*c).
+
+    Return nan if the numerator and denominator are 0.
+    Return inf if just the denominator is 0.
+    """
+    # table must be a 2x2 numpy array.
+    if table[1, 0] > 0 and table[0, 1] > 0:
+        oddsratio = table[0, 0] * table[1, 1] / (table[1, 0] * table[0, 1])
+    elif table[0, 0] == 0 or table[1, 1] == 0:
+        oddsratio = np.nan
+    else:
+        oddsratio = np.inf
+    return oddsratio
+
+
+def _solve(func):
+    """
+    Solve func(nc) = 0.  func must be an increasing function.
+    """
+    # We could just as well call the variable `x` instead of `nc`, but we
+    # always call this function with functions for which nc (the noncentrality
+    # parameter) is the variable for which we are solving.
+    nc = 1.0
+    value = func(nc)
+    if value == 0:
+        return nc
+
+    # Multiplicative factor by which to increase or decrease nc when
+    # searching for a bracketing interval.
+    factor = 2.0
+    # Find a bracketing interval.
+    if value > 0:
+        nc /= factor
+        while func(nc) > 0:
+            nc /= factor
+        lo = nc
+        hi = factor*nc
+    else:
+        nc *= factor
+        while func(nc) < 0:
+            nc *= factor
+        lo = nc/factor
+        hi = nc
+
+    # lo and hi bracket the solution for nc.
+    nc = brentq(func, lo, hi, xtol=1e-13)
+    return nc
+
+
+def _nc_hypergeom_mean_inverse(x, M, n, N):
+    """
+    For the given noncentral hypergeometric parameters x, M, n,and N
+    (table[0,0], total, row 0 sum and column 0 sum, resp., of a 2x2
+    contingency table), find the noncentrality parameter of Fisher's
+    noncentral hypergeometric distribution whose mean is x.
+    """
+    nc = _solve(lambda nc: nchypergeom_fisher.mean(M, n, N, nc) - x)
+    return nc
+
+
+def _hypergeom_params_from_table(table):
+    # The notation M, n and N is consistent with stats.hypergeom and
+    # stats.nchypergeom_fisher.
+    x = table[0, 0]
+    M = table.sum()
+    n = table[0].sum()
+    N = table[:, 0].sum()
+    return x, M, n, N
+
+
+def _ci_upper(table, alpha):
+    """
+    Compute the upper end of the confidence interval.
+    """
+    if _sample_odds_ratio(table) == np.inf:
+        return np.inf
+
+    x, M, n, N = _hypergeom_params_from_table(table)
+
+    # nchypergeom_fisher.cdf is a decreasing function of nc, so we negate
+    # it in the lambda expression.
+    nc = _solve(lambda nc: -nchypergeom_fisher.cdf(x, M, n, N, nc) + alpha)
+    return nc
+
+
+def _ci_lower(table, alpha):
+    """
+    Compute the lower end of the confidence interval.
+    """
+    if _sample_odds_ratio(table) == 0:
+        return 0
+
+    x, M, n, N = _hypergeom_params_from_table(table)
+
+    nc = _solve(lambda nc: nchypergeom_fisher.sf(x - 1, M, n, N, nc) - alpha)
+    return nc
+
+
+def _conditional_oddsratio(table):
+    """
+    Conditional MLE of the odds ratio for the 2x2 contingency table.
+    """
+    x, M, n, N = _hypergeom_params_from_table(table)
+    # Get the bounds of the support.  The support of the noncentral
+    # hypergeometric distribution with parameters M, n, and N is the same
+    # for all values of the noncentrality parameter, so we can use 1 here.
+    lo, hi = nchypergeom_fisher.support(M, n, N, 1)
+
+    # Check if x is at one of the extremes of the support.  If so, we know
+    # the odds ratio is either 0 or inf.
+    if x == lo:
+        # x is at the low end of the support.
+        return 0
+    if x == hi:
+        # x is at the high end of the support.
+        return np.inf
+
+    nc = _nc_hypergeom_mean_inverse(x, M, n, N)
+    return nc
+
+
+def _conditional_oddsratio_ci(table, confidence_level=0.95,
+                              alternative='two-sided'):
+    """
+    Conditional exact confidence interval for the odds ratio.
+    """
+    if alternative == 'two-sided':
+        alpha = 0.5*(1 - confidence_level)
+        lower = _ci_lower(table, alpha)
+        upper = _ci_upper(table, alpha)
+    elif alternative == 'less':
+        lower = 0.0
+        upper = _ci_upper(table, 1 - confidence_level)
+    else:
+        # alternative == 'greater'
+        lower = _ci_lower(table, 1 - confidence_level)
+        upper = np.inf
+
+    return lower, upper
+
+
+def _sample_odds_ratio_ci(table, confidence_level=0.95,
+                          alternative='two-sided'):
+    oddsratio = _sample_odds_ratio(table)
+    log_or = np.log(oddsratio)
+    se = np.sqrt((1/table).sum())
+    if alternative == 'less':
+        z = ndtri(confidence_level)
+        loglow = -np.inf
+        loghigh = log_or + z*se
+    elif alternative == 'greater':
+        z = ndtri(confidence_level)
+        loglow = log_or - z*se
+        loghigh = np.inf
+    else:
+        # alternative is 'two-sided'
+        z = ndtri(0.5*confidence_level + 0.5)
+        loglow = log_or - z*se
+        loghigh = log_or + z*se
+
+    return np.exp(loglow), np.exp(loghigh)
+
+
+class OddsRatioResult:
+    """
+    Result of `scipy.stats.contingency.odds_ratio`.  See the
+    docstring for `odds_ratio` for more details.
+
+    Attributes
+    ----------
+    statistic : float
+        The computed odds ratio.
+
+        * If `kind` is ``'sample'``, this is sample (or unconditional)
+          estimate, given by
+          ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
+        * If `kind` is ``'conditional'``, this is the conditional
+          maximum likelihood estimate for the odds ratio. It is
+          the noncentrality parameter of Fisher's noncentral
+          hypergeometric distribution with the same hypergeometric
+          parameters as `table` and whose mean is ``table[0, 0]``.
+
+    Methods
+    -------
+    confidence_interval :
+        Confidence interval for the odds ratio.
+    """
+
+    def __init__(self, _table, _kind, statistic):
+        # for now, no need to make _table and _kind public, since this sort of
+        # information is returned in very few `scipy.stats` results
+        self._table = _table
+        self._kind = _kind
+        self.statistic = statistic
+
+    def __repr__(self):
+        return f"OddsRatioResult(statistic={self.statistic})"
+
+    def confidence_interval(self, confidence_level=0.95,
+                            alternative='two-sided'):
+        """
+        Confidence interval for the odds ratio.
+
+        Parameters
+        ----------
+        confidence_level: float
+            Desired confidence level for the confidence interval.
+            The value must be given as a fraction between 0 and 1.
+            Default is 0.95 (meaning 95%).
+
+        alternative : {'two-sided', 'less', 'greater'}, optional
+            The alternative hypothesis of the hypothesis test to which the
+            confidence interval corresponds. That is, suppose the null
+            hypothesis is that the true odds ratio equals ``OR`` and the
+            confidence interval is ``(low, high)``. Then the following options
+            for `alternative` are available (default is 'two-sided'):
+
+            * 'two-sided': the true odds ratio is not equal to ``OR``. There
+              is evidence against the null hypothesis at the chosen
+              `confidence_level` if ``high < OR`` or ``low > OR``.
+            * 'less': the true odds ratio is less than ``OR``. The ``low`` end
+              of the confidence interval is 0, and there is evidence against
+              the null hypothesis at  the chosen `confidence_level` if
+              ``high < OR``.
+            * 'greater': the true odds ratio is greater than ``OR``.  The
+              ``high`` end of the confidence interval is ``np.inf``, and there
+              is evidence against the null hypothesis at the chosen
+              `confidence_level` if ``low > OR``.
+
+        Returns
+        -------
+        ci : ``ConfidenceInterval`` instance
+            The confidence interval, represented as an object with
+            attributes ``low`` and ``high``.
+
+        Notes
+        -----
+        When `kind` is ``'conditional'``, the limits of the confidence
+        interval are the conditional "exact confidence limits" as described
+        by Fisher [1]_. The conditional odds ratio and confidence interval are
+        also discussed in Section 4.1.2 of the text by Sahai and Khurshid [2]_.
+
+        When `kind` is ``'sample'``, the confidence interval is computed
+        under the assumption that the logarithm of the odds ratio is normally
+        distributed with standard error given by::
+
+            se = sqrt(1/a + 1/b + 1/c + 1/d)
+
+        where ``a``, ``b``, ``c`` and ``d`` are the elements of the
+        contingency table.  (See, for example, [2]_, section 3.1.3.2,
+        or [3]_, section 2.3.3).
+
+        References
+        ----------
+        .. [1] R. A. Fisher (1935), The logic of inductive inference,
+               Journal of the Royal Statistical Society, Vol. 98, No. 1,
+               pp. 39-82.
+        .. [2] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
+               Methods, Techniques, and Applications, CRC Press LLC, Boca
+               Raton, Florida.
+        .. [3] Alan Agresti, An Introduction to Categorical Data Analysis
+               (second edition), Wiley, Hoboken, NJ, USA (2007).
+        """
+        if alternative not in ['two-sided', 'less', 'greater']:
+            raise ValueError("`alternative` must be 'two-sided', 'less' or "
+                             "'greater'.")
+
+        if confidence_level < 0 or confidence_level > 1:
+            raise ValueError('confidence_level must be between 0 and 1')
+
+        if self._kind == 'conditional':
+            ci = self._conditional_odds_ratio_ci(confidence_level, alternative)
+        else:
+            ci = self._sample_odds_ratio_ci(confidence_level, alternative)
+        return ci
+
+    def _conditional_odds_ratio_ci(self, confidence_level=0.95,
+                                   alternative='two-sided'):
+        """
+        Confidence interval for the conditional odds ratio.
+        """
+
+        table = self._table
+        if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
+            # If both values in a row or column are zero, the p-value is 1,
+            # the odds ratio is NaN and the confidence interval is (0, inf).
+            ci = (0, np.inf)
+        else:
+            ci = _conditional_oddsratio_ci(table,
+                                           confidence_level=confidence_level,
+                                           alternative=alternative)
+        return ConfidenceInterval(low=ci[0], high=ci[1])
+
+    def _sample_odds_ratio_ci(self, confidence_level=0.95,
+                              alternative='two-sided'):
+        """
+        Confidence interval for the sample odds ratio.
+        """
+        if confidence_level < 0 or confidence_level > 1:
+            raise ValueError('confidence_level must be between 0 and 1')
+
+        table = self._table
+        if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
+            # If both values in a row or column are zero, the p-value is 1,
+            # the odds ratio is NaN and the confidence interval is (0, inf).
+            ci = (0, np.inf)
+        else:
+            ci = _sample_odds_ratio_ci(table,
+                                       confidence_level=confidence_level,
+                                       alternative=alternative)
+        return ConfidenceInterval(low=ci[0], high=ci[1])
+
+
+def odds_ratio(table, *, kind='conditional'):
+    r"""
+    Compute the odds ratio for a 2x2 contingency table.
+
+    Parameters
+    ----------
+    table : array_like of ints
+        A 2x2 contingency table.  Elements must be non-negative integers.
+    kind : str, optional
+        Which kind of odds ratio to compute, either the sample
+        odds ratio (``kind='sample'``) or the conditional odds ratio
+        (``kind='conditional'``).  Default is ``'conditional'``.
+
+    Returns
+    -------
+    result : `~scipy.stats._result_classes.OddsRatioResult` instance
+        The returned object has two computed attributes:
+
+        statistic : float
+            * If `kind` is ``'sample'``, this is sample (or unconditional)
+              estimate, given by
+              ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
+            * If `kind` is ``'conditional'``, this is the conditional
+              maximum likelihood estimate for the odds ratio. It is
+              the noncentrality parameter of Fisher's noncentral
+              hypergeometric distribution with the same hypergeometric
+              parameters as `table` and whose mean is ``table[0, 0]``.
+
+        The object has the method `confidence_interval` that computes
+        the confidence interval of the odds ratio.
+
+    See Also
+    --------
+    scipy.stats.fisher_exact
+    relative_risk
+
+    Notes
+    -----
+    The conditional odds ratio was discussed by Fisher (see "Example 1"
+    of [1]_).  Texts that cover the odds ratio include [2]_ and [3]_.
+
+    .. versionadded:: 1.10.0
+
+    References
+    ----------
+    .. [1] R. A. Fisher (1935), The logic of inductive inference,
+           Journal of the Royal Statistical Society, Vol. 98, No. 1,
+           pp. 39-82.
+    .. [2] Breslow NE, Day NE (1980). Statistical methods in cancer research.
+           Volume I - The analysis of case-control studies. IARC Sci Publ.
+           (32):5-338. PMID: 7216345. (See section 4.2.)
+    .. [3] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
+           Methods, Techniques, and Applications, CRC Press LLC, Boca
+           Raton, Florida.
+    .. [4] Berger, Jeffrey S. et al. "Aspirin for the Primary Prevention of
+           Cardiovascular Events in Women and Men: A Sex-Specific
+           Meta-analysis of Randomized Controlled Trials."
+           JAMA, 295(3):306-313, :doi:`10.1001/jama.295.3.306`, 2006.
+
+    Examples
+    --------
+    In epidemiology, individuals are classified as "exposed" or
+    "unexposed" to some factor or treatment. If the occurrence of some
+    illness is under study, those who have the illness are often
+    classified as "cases", and those without it are "noncases".  The
+    counts of the occurrences of these classes gives a contingency
+    table::
+
+                    exposed    unexposed
+        cases          a           b
+        noncases       c           d
+
+    The sample odds ratio may be written ``(a/c) / (b/d)``.  ``a/c`` can
+    be interpreted as the odds of a case occurring in the exposed group,
+    and ``b/d`` as the odds of a case occurring in the unexposed group.
+    The sample odds ratio is the ratio of these odds.  If the odds ratio
+    is greater than 1, it suggests that there is a positive association
+    between being exposed and being a case.
+
+    Interchanging the rows or columns of the contingency table inverts
+    the odds ratio, so it is important to understand the meaning of labels
+    given to the rows and columns of the table when interpreting the
+    odds ratio.
+
+    In [4]_, the use of aspirin to prevent cardiovascular events in women
+    and men was investigated. The study notably concluded:
+
+        ...aspirin therapy reduced the risk of a composite of
+        cardiovascular events due to its effect on reducing the risk of
+        ischemic stroke in women [...]
+
+    The article lists studies of various cardiovascular events. Let's
+    focus on the ischemic stoke in women.
+
+    The following table summarizes the results of the experiment in which
+    participants took aspirin or a placebo on a regular basis for several
+    years. Cases of ischemic stroke were recorded::
+
+                          Aspirin   Control/Placebo
+        Ischemic stroke     176           230
+        No stroke         21035         21018
+
+    The question we ask is "Is there evidence that the aspirin reduces the
+    risk of ischemic stroke?"
+
+    Compute the odds ratio:
+
+    >>> from scipy.stats.contingency import odds_ratio
+    >>> res = odds_ratio([[176, 230], [21035, 21018]])
+    >>> res.statistic
+    0.7646037659999126
+
+    For this sample, the odds of getting an ischemic stroke for those who have
+    been taking aspirin are 0.76 times that of those
+    who have received the placebo.
+
+    To make statistical inferences about the population under study,
+    we can compute the 95% confidence interval for the odds ratio:
+
+    >>> res.confidence_interval(confidence_level=0.95)
+    ConfidenceInterval(low=0.6241234078749812, high=0.9354102892100372)
+
+    The 95% confidence interval for the conditional odds ratio is
+    approximately (0.62, 0.94).
+
+    The fact that the entire 95% confidence interval falls below 1 supports
+    the authors' conclusion that the aspirin was associated with a
+    statistically significant reduction in ischemic stroke.
+    """
+    if kind not in ['conditional', 'sample']:
+        raise ValueError("`kind` must be 'conditional' or 'sample'.")
+
+    c = np.asarray(table)
+
+    if c.shape != (2, 2):
+        raise ValueError(f"Invalid shape {c.shape}. The input `table` must be "
+                         "of shape (2, 2).")
+
+    if not np.issubdtype(c.dtype, np.integer):
+        raise ValueError("`table` must be an array of integers, but got "
+                         f"type {c.dtype}")
+    c = c.astype(np.int64)
+
+    if np.any(c < 0):
+        raise ValueError("All values in `table` must be nonnegative.")
+
+    if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
+        # If both values in a row or column are zero, the p-value is NaN and
+        # the odds ratio is NaN.
+        result = OddsRatioResult(_table=c, _kind=kind, statistic=np.nan)
+        return result
+
+    if kind == 'sample':
+        oddsratio = _sample_odds_ratio(c)
+    else:  # kind is 'conditional'
+        oddsratio = _conditional_oddsratio(c)
+
+    result = OddsRatioResult(_table=c, _kind=kind, statistic=oddsratio)
+    return result