using for loop to install conda package
This commit is contained in:
515
.CondaPkg/env/Lib/site-packages/scipy/stats/__init__.py
vendored
Normal file
515
.CondaPkg/env/Lib/site-packages/scipy/stats/__init__.py
vendored
Normal file
@@ -0,0 +1,515 @@
|
||||
"""
|
||||
.. _statsrefmanual:
|
||||
|
||||
==========================================
|
||||
Statistical functions (:mod:`scipy.stats`)
|
||||
==========================================
|
||||
|
||||
.. currentmodule:: scipy.stats
|
||||
|
||||
This module contains a large number of probability distributions,
|
||||
summary and frequency statistics, correlation functions and statistical
|
||||
tests, masked statistics, kernel density estimation, quasi-Monte Carlo
|
||||
functionality, and more.
|
||||
|
||||
Statistics is a very large area, and there are topics that are out of scope
|
||||
for SciPy and are covered by other packages. Some of the most important ones
|
||||
are:
|
||||
|
||||
- `statsmodels <https://www.statsmodels.org/stable/index.html>`__:
|
||||
regression, linear models, time series analysis, extensions to topics
|
||||
also covered by ``scipy.stats``.
|
||||
- `Pandas <https://pandas.pydata.org/>`__: tabular data, time series
|
||||
functionality, interfaces to other statistical languages.
|
||||
- `PyMC <https://docs.pymc.io/>`__: Bayesian statistical
|
||||
modeling, probabilistic machine learning.
|
||||
- `scikit-learn <https://scikit-learn.org/>`__: classification, regression,
|
||||
model selection.
|
||||
- `Seaborn <https://seaborn.pydata.org/>`__: statistical data visualization.
|
||||
- `rpy2 <https://rpy2.github.io/>`__: Python to R bridge.
|
||||
|
||||
|
||||
Probability distributions
|
||||
=========================
|
||||
|
||||
Each univariate distribution is an instance of a subclass of `rv_continuous`
|
||||
(`rv_discrete` for discrete distributions):
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
rv_continuous
|
||||
rv_discrete
|
||||
rv_histogram
|
||||
|
||||
Continuous distributions
|
||||
------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
alpha -- Alpha
|
||||
anglit -- Anglit
|
||||
arcsine -- Arcsine
|
||||
argus -- Argus
|
||||
beta -- Beta
|
||||
betaprime -- Beta Prime
|
||||
bradford -- Bradford
|
||||
burr -- Burr (Type III)
|
||||
burr12 -- Burr (Type XII)
|
||||
cauchy -- Cauchy
|
||||
chi -- Chi
|
||||
chi2 -- Chi-squared
|
||||
cosine -- Cosine
|
||||
crystalball -- Crystalball
|
||||
dgamma -- Double Gamma
|
||||
dweibull -- Double Weibull
|
||||
erlang -- Erlang
|
||||
expon -- Exponential
|
||||
exponnorm -- Exponentially Modified Normal
|
||||
exponweib -- Exponentiated Weibull
|
||||
exponpow -- Exponential Power
|
||||
f -- F (Snecdor F)
|
||||
fatiguelife -- Fatigue Life (Birnbaum-Saunders)
|
||||
fisk -- Fisk
|
||||
foldcauchy -- Folded Cauchy
|
||||
foldnorm -- Folded Normal
|
||||
genlogistic -- Generalized Logistic
|
||||
gennorm -- Generalized normal
|
||||
genpareto -- Generalized Pareto
|
||||
genexpon -- Generalized Exponential
|
||||
genextreme -- Generalized Extreme Value
|
||||
gausshyper -- Gauss Hypergeometric
|
||||
gamma -- Gamma
|
||||
gengamma -- Generalized gamma
|
||||
genhalflogistic -- Generalized Half Logistic
|
||||
genhyperbolic -- Generalized Hyperbolic
|
||||
geninvgauss -- Generalized Inverse Gaussian
|
||||
gibrat -- Gibrat
|
||||
gilbrat -- Gilbrat
|
||||
gompertz -- Gompertz (Truncated Gumbel)
|
||||
gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
|
||||
gumbel_l -- Left Sided Gumbel, etc.
|
||||
halfcauchy -- Half Cauchy
|
||||
halflogistic -- Half Logistic
|
||||
halfnorm -- Half Normal
|
||||
halfgennorm -- Generalized Half Normal
|
||||
hypsecant -- Hyperbolic Secant
|
||||
invgamma -- Inverse Gamma
|
||||
invgauss -- Inverse Gaussian
|
||||
invweibull -- Inverse Weibull
|
||||
johnsonsb -- Johnson SB
|
||||
johnsonsu -- Johnson SU
|
||||
kappa4 -- Kappa 4 parameter
|
||||
kappa3 -- Kappa 3 parameter
|
||||
ksone -- Distribution of Kolmogorov-Smirnov one-sided test statistic
|
||||
kstwo -- Distribution of Kolmogorov-Smirnov two-sided test statistic
|
||||
kstwobign -- Limiting Distribution of scaled Kolmogorov-Smirnov two-sided test statistic.
|
||||
laplace -- Laplace
|
||||
laplace_asymmetric -- Asymmetric Laplace
|
||||
levy -- Levy
|
||||
levy_l
|
||||
levy_stable
|
||||
logistic -- Logistic
|
||||
loggamma -- Log-Gamma
|
||||
loglaplace -- Log-Laplace (Log Double Exponential)
|
||||
lognorm -- Log-Normal
|
||||
loguniform -- Log-Uniform
|
||||
lomax -- Lomax (Pareto of the second kind)
|
||||
maxwell -- Maxwell
|
||||
mielke -- Mielke's Beta-Kappa
|
||||
moyal -- Moyal
|
||||
nakagami -- Nakagami
|
||||
ncx2 -- Non-central chi-squared
|
||||
ncf -- Non-central F
|
||||
nct -- Non-central Student's T
|
||||
norm -- Normal (Gaussian)
|
||||
norminvgauss -- Normal Inverse Gaussian
|
||||
pareto -- Pareto
|
||||
pearson3 -- Pearson type III
|
||||
powerlaw -- Power-function
|
||||
powerlognorm -- Power log normal
|
||||
powernorm -- Power normal
|
||||
rdist -- R-distribution
|
||||
rayleigh -- Rayleigh
|
||||
rice -- Rice
|
||||
recipinvgauss -- Reciprocal Inverse Gaussian
|
||||
semicircular -- Semicircular
|
||||
skewcauchy -- Skew Cauchy
|
||||
skewnorm -- Skew normal
|
||||
studentized_range -- Studentized Range
|
||||
t -- Student's T
|
||||
trapezoid -- Trapezoidal
|
||||
triang -- Triangular
|
||||
truncexpon -- Truncated Exponential
|
||||
truncnorm -- Truncated Normal
|
||||
truncpareto -- Truncated Pareto
|
||||
truncweibull_min -- Truncated minimum Weibull distribution
|
||||
tukeylambda -- Tukey-Lambda
|
||||
uniform -- Uniform
|
||||
vonmises -- Von-Mises (Circular)
|
||||
vonmises_line -- Von-Mises (Line)
|
||||
wald -- Wald
|
||||
weibull_min -- Minimum Weibull (see Frechet)
|
||||
weibull_max -- Maximum Weibull (see Frechet)
|
||||
wrapcauchy -- Wrapped Cauchy
|
||||
|
||||
Multivariate distributions
|
||||
--------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
multivariate_normal -- Multivariate normal distribution
|
||||
matrix_normal -- Matrix normal distribution
|
||||
dirichlet -- Dirichlet
|
||||
wishart -- Wishart
|
||||
invwishart -- Inverse Wishart
|
||||
multinomial -- Multinomial distribution
|
||||
special_ortho_group -- SO(N) group
|
||||
ortho_group -- O(N) group
|
||||
unitary_group -- U(N) group
|
||||
random_correlation -- random correlation matrices
|
||||
multivariate_t -- Multivariate t-distribution
|
||||
multivariate_hypergeom -- Multivariate hypergeometric distribution
|
||||
random_table -- Distribution of random tables with given marginals
|
||||
uniform_direction -- Uniform distribution on S(N-1)
|
||||
|
||||
`scipy.stats.multivariate_normal` methods accept instances
|
||||
of the following class to represent the covariance.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
Covariance -- Representation of a covariance matrix
|
||||
|
||||
|
||||
Discrete distributions
|
||||
----------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
bernoulli -- Bernoulli
|
||||
betabinom -- Beta-Binomial
|
||||
binom -- Binomial
|
||||
boltzmann -- Boltzmann (Truncated Discrete Exponential)
|
||||
dlaplace -- Discrete Laplacian
|
||||
geom -- Geometric
|
||||
hypergeom -- Hypergeometric
|
||||
logser -- Logarithmic (Log-Series, Series)
|
||||
nbinom -- Negative Binomial
|
||||
nchypergeom_fisher -- Fisher's Noncentral Hypergeometric
|
||||
nchypergeom_wallenius -- Wallenius's Noncentral Hypergeometric
|
||||
nhypergeom -- Negative Hypergeometric
|
||||
planck -- Planck (Discrete Exponential)
|
||||
poisson -- Poisson
|
||||
randint -- Discrete Uniform
|
||||
skellam -- Skellam
|
||||
yulesimon -- Yule-Simon
|
||||
zipf -- Zipf (Zeta)
|
||||
zipfian -- Zipfian
|
||||
|
||||
An overview of statistical functions is given below. Many of these functions
|
||||
have a similar version in `scipy.stats.mstats` which work for masked arrays.
|
||||
|
||||
Summary statistics
|
||||
==================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
describe -- Descriptive statistics
|
||||
gmean -- Geometric mean
|
||||
hmean -- Harmonic mean
|
||||
pmean -- Power mean
|
||||
kurtosis -- Fisher or Pearson kurtosis
|
||||
mode -- Modal value
|
||||
moment -- Central moment
|
||||
expectile -- Expectile
|
||||
skew -- Skewness
|
||||
kstat --
|
||||
kstatvar --
|
||||
tmean -- Truncated arithmetic mean
|
||||
tvar -- Truncated variance
|
||||
tmin --
|
||||
tmax --
|
||||
tstd --
|
||||
tsem --
|
||||
variation -- Coefficient of variation
|
||||
find_repeats
|
||||
trim_mean
|
||||
gstd -- Geometric Standard Deviation
|
||||
iqr
|
||||
sem
|
||||
bayes_mvs
|
||||
mvsdist
|
||||
entropy
|
||||
differential_entropy
|
||||
median_abs_deviation
|
||||
|
||||
Frequency statistics
|
||||
====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
cumfreq
|
||||
percentileofscore
|
||||
scoreatpercentile
|
||||
relfreq
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
binned_statistic -- Compute a binned statistic for a set of data.
|
||||
binned_statistic_2d -- Compute a 2-D binned statistic for a set of data.
|
||||
binned_statistic_dd -- Compute a d-D binned statistic for a set of data.
|
||||
|
||||
Correlation functions
|
||||
=====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
f_oneway
|
||||
alexandergovern
|
||||
pearsonr
|
||||
spearmanr
|
||||
pointbiserialr
|
||||
kendalltau
|
||||
weightedtau
|
||||
somersd
|
||||
linregress
|
||||
siegelslopes
|
||||
theilslopes
|
||||
multiscale_graphcorr
|
||||
|
||||
Statistical tests
|
||||
=================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_1samp
|
||||
ttest_ind
|
||||
ttest_ind_from_stats
|
||||
ttest_rel
|
||||
chisquare
|
||||
cramervonmises
|
||||
cramervonmises_2samp
|
||||
power_divergence
|
||||
kstest
|
||||
ks_1samp
|
||||
ks_2samp
|
||||
epps_singleton_2samp
|
||||
mannwhitneyu
|
||||
tiecorrect
|
||||
rankdata
|
||||
ranksums
|
||||
wilcoxon
|
||||
kruskal
|
||||
friedmanchisquare
|
||||
brunnermunzel
|
||||
combine_pvalues
|
||||
jarque_bera
|
||||
page_trend_test
|
||||
tukey_hsd
|
||||
poisson_means_test
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ansari
|
||||
bartlett
|
||||
levene
|
||||
shapiro
|
||||
anderson
|
||||
anderson_ksamp
|
||||
binom_test
|
||||
binomtest
|
||||
fligner
|
||||
median_test
|
||||
mood
|
||||
skewtest
|
||||
kurtosistest
|
||||
normaltest
|
||||
goodness_of_fit
|
||||
|
||||
|
||||
Quasi-Monte Carlo
|
||||
=================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
stats.qmc
|
||||
|
||||
Resampling Methods
|
||||
==================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
bootstrap
|
||||
permutation_test
|
||||
monte_carlo_test
|
||||
|
||||
Masked statistics functions
|
||||
===========================
|
||||
|
||||
.. toctree::
|
||||
|
||||
stats.mstats
|
||||
|
||||
|
||||
Other statistical functionality
|
||||
===============================
|
||||
|
||||
Transformations
|
||||
---------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
boxcox
|
||||
boxcox_normmax
|
||||
boxcox_llf
|
||||
yeojohnson
|
||||
yeojohnson_normmax
|
||||
yeojohnson_llf
|
||||
obrientransform
|
||||
sigmaclip
|
||||
trimboth
|
||||
trim1
|
||||
zmap
|
||||
zscore
|
||||
gzscore
|
||||
|
||||
Statistical distances
|
||||
---------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
wasserstein_distance
|
||||
energy_distance
|
||||
|
||||
Sampling
|
||||
--------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
stats.sampling
|
||||
|
||||
Random variate generation / CDF Inversion
|
||||
-----------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
rvs_ratio_uniforms
|
||||
|
||||
Distribution Fitting
|
||||
--------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
fit
|
||||
|
||||
Directional statistical functions
|
||||
---------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
directional_stats
|
||||
circmean
|
||||
circvar
|
||||
circstd
|
||||
|
||||
Contingency table functions
|
||||
---------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
chi2_contingency
|
||||
contingency.crosstab
|
||||
contingency.expected_freq
|
||||
contingency.margins
|
||||
contingency.relative_risk
|
||||
contingency.association
|
||||
contingency.odds_ratio
|
||||
fisher_exact
|
||||
barnard_exact
|
||||
boschloo_exact
|
||||
|
||||
Plot-tests
|
||||
----------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ppcc_max
|
||||
ppcc_plot
|
||||
probplot
|
||||
boxcox_normplot
|
||||
yeojohnson_normplot
|
||||
|
||||
Univariate and multivariate kernel density estimation
|
||||
-----------------------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
gaussian_kde
|
||||
|
||||
Warnings / Errors used in :mod:`scipy.stats`
|
||||
--------------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
DegenerateDataWarning
|
||||
ConstantInputWarning
|
||||
NearConstantInputWarning
|
||||
FitError
|
||||
|
||||
"""
|
||||
|
||||
from ._warnings_errors import (ConstantInputWarning, NearConstantInputWarning,
|
||||
DegenerateDataWarning, FitError)
|
||||
from ._stats_py import *
|
||||
from ._variation import variation
|
||||
from .distributions import *
|
||||
from ._morestats import *
|
||||
from ._binomtest import binomtest
|
||||
from ._binned_statistic import *
|
||||
from ._kde import gaussian_kde
|
||||
from . import mstats
|
||||
from . import qmc
|
||||
from ._multivariate import *
|
||||
from . import contingency
|
||||
from .contingency import chi2_contingency
|
||||
from ._resampling import bootstrap, monte_carlo_test, permutation_test
|
||||
from ._entropy import *
|
||||
from ._hypotests import *
|
||||
from ._rvs_sampling import rvs_ratio_uniforms
|
||||
from ._page_trend_test import page_trend_test
|
||||
from ._mannwhitneyu import mannwhitneyu
|
||||
from ._fit import fit, goodness_of_fit
|
||||
from ._covariance import Covariance
|
||||
|
||||
# Deprecated namespaces, to be removed in v2.0.0
|
||||
from . import (
|
||||
biasedurn, kde, morestats, mstats_basic, mstats_extras, mvn, statlib, stats
|
||||
)
|
||||
|
||||
__all__ = [s for s in dir() if not s.startswith("_")] # Remove dunders.
|
||||
|
||||
from scipy._lib._testutils import PytestTester
|
||||
test = PytestTester(__name__)
|
||||
del PytestTester
|
||||
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/__init__.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/__init__.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_axis_nan_policy.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_axis_nan_policy.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_binned_statistic.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_binned_statistic.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_binomtest.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_binomtest.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_common.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_common.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_constants.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_constants.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_continuous_distns.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_continuous_distns.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_covariance.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_covariance.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_crosstab.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_crosstab.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_discrete_distns.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_discrete_distns.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_distn_infrastructure.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_distn_infrastructure.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_distr_params.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_distr_params.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_entropy.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_entropy.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_fit.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_fit.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_generate_pyx.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_generate_pyx.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_hypotests.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_hypotests.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_kde.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_kde.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_ksstats.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_ksstats.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_mannwhitneyu.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_mannwhitneyu.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_morestats.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_morestats.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_mstats_basic.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_mstats_basic.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_mstats_extras.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_mstats_extras.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_multivariate.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_multivariate.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_odds_ratio.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_odds_ratio.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_page_trend_test.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_page_trend_test.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_qmc.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_qmc.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_relative_risk.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_relative_risk.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_resampling.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_resampling.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_result_classes.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_result_classes.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_rvs_sampling.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_rvs_sampling.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_stats_mstats_common.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_stats_mstats_common.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_stats_py.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_stats_py.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_tukeylambda_stats.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_tukeylambda_stats.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_variation.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_variation.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_warnings_errors.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/_warnings_errors.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/biasedurn.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/biasedurn.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/contingency.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/contingency.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/distributions.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/distributions.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/kde.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/kde.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/morestats.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/morestats.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/mstats.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/mstats.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/mstats_basic.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/mstats_basic.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/mstats_extras.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/mstats_extras.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/mvn.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/mvn.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/qmc.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/qmc.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/sampling.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/sampling.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/statlib.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/statlib.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/stats.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/__pycache__/stats.cpython-311.pyc
vendored
Normal file
Binary file not shown.
605
.CondaPkg/env/Lib/site-packages/scipy/stats/_axis_nan_policy.py
vendored
Normal file
605
.CondaPkg/env/Lib/site-packages/scipy/stats/_axis_nan_policy.py
vendored
Normal file
@@ -0,0 +1,605 @@
|
||||
# Many scipy.stats functions support `axis` and `nan_policy` parameters.
|
||||
# When the two are combined, it can be tricky to get all the behavior just
|
||||
# right. This file contains utility functions useful for scipy.stats functions
|
||||
# that support `axis` and `nan_policy`, including a decorator that
|
||||
# automatically adds `axis` and `nan_policy` arguments to a function.
|
||||
|
||||
import numpy as np
|
||||
from functools import wraps
|
||||
from scipy._lib._docscrape import FunctionDoc, Parameter
|
||||
from scipy._lib._util import _contains_nan
|
||||
import inspect
|
||||
|
||||
|
||||
def _broadcast_arrays(arrays, axis=None):
|
||||
"""
|
||||
Broadcast shapes of arrays, ignoring incompatibility of specified axes
|
||||
"""
|
||||
new_shapes = _broadcast_array_shapes(arrays, axis=axis)
|
||||
if axis is None:
|
||||
new_shapes = [new_shapes]*len(arrays)
|
||||
return [np.broadcast_to(array, new_shape)
|
||||
for array, new_shape in zip(arrays, new_shapes)]
|
||||
|
||||
|
||||
def _broadcast_array_shapes(arrays, axis=None):
|
||||
"""
|
||||
Broadcast shapes of arrays, ignoring incompatibility of specified axes
|
||||
"""
|
||||
shapes = [np.asarray(arr).shape for arr in arrays]
|
||||
return _broadcast_shapes(shapes, axis)
|
||||
|
||||
|
||||
def _broadcast_shapes(shapes, axis=None):
|
||||
"""
|
||||
Broadcast shapes, ignoring incompatibility of specified axes
|
||||
"""
|
||||
if not shapes:
|
||||
return shapes
|
||||
|
||||
# input validation
|
||||
if axis is not None:
|
||||
axis = np.atleast_1d(axis)
|
||||
axis_int = axis.astype(int)
|
||||
if not np.array_equal(axis_int, axis):
|
||||
raise np.AxisError('`axis` must be an integer, a '
|
||||
'tuple of integers, or `None`.')
|
||||
axis = axis_int
|
||||
|
||||
# First, ensure all shapes have same number of dimensions by prepending 1s.
|
||||
n_dims = max([len(shape) for shape in shapes])
|
||||
new_shapes = np.ones((len(shapes), n_dims), dtype=int)
|
||||
for row, shape in zip(new_shapes, shapes):
|
||||
row[len(row)-len(shape):] = shape # can't use negative indices (-0:)
|
||||
|
||||
# Remove the shape elements of the axes to be ignored, but remember them.
|
||||
if axis is not None:
|
||||
axis[axis < 0] = n_dims + axis[axis < 0]
|
||||
axis = np.sort(axis)
|
||||
if axis[-1] >= n_dims or axis[0] < 0:
|
||||
message = (f"`axis` is out of bounds "
|
||||
f"for array of dimension {n_dims}")
|
||||
raise np.AxisError(message)
|
||||
|
||||
if len(np.unique(axis)) != len(axis):
|
||||
raise np.AxisError("`axis` must contain only distinct elements")
|
||||
|
||||
removed_shapes = new_shapes[:, axis]
|
||||
new_shapes = np.delete(new_shapes, axis, axis=1)
|
||||
|
||||
# If arrays are broadcastable, shape elements that are 1 may be replaced
|
||||
# with a corresponding non-1 shape element. Assuming arrays are
|
||||
# broadcastable, that final shape element can be found with:
|
||||
new_shape = np.max(new_shapes, axis=0)
|
||||
# except in case of an empty array:
|
||||
new_shape *= new_shapes.all(axis=0)
|
||||
|
||||
# Among all arrays, there can only be one unique non-1 shape element.
|
||||
# Therefore, if any non-1 shape element does not match what we found
|
||||
# above, the arrays must not be broadcastable after all.
|
||||
if np.any(~((new_shapes == 1) | (new_shapes == new_shape))):
|
||||
raise ValueError("Array shapes are incompatible for broadcasting.")
|
||||
|
||||
if axis is not None:
|
||||
# Add back the shape elements that were ignored
|
||||
new_axis = axis - np.arange(len(axis))
|
||||
new_shapes = [tuple(np.insert(new_shape, new_axis, removed_shape))
|
||||
for removed_shape in removed_shapes]
|
||||
return new_shapes
|
||||
else:
|
||||
return tuple(new_shape)
|
||||
|
||||
|
||||
def _broadcast_array_shapes_remove_axis(arrays, axis=None):
|
||||
"""
|
||||
Broadcast shapes of arrays, dropping specified axes
|
||||
|
||||
Given a sequence of arrays `arrays` and an integer or tuple `axis`, find
|
||||
the shape of the broadcast result after consuming/dropping `axis`.
|
||||
In other words, return output shape of a typical hypothesis test on
|
||||
`arrays` vectorized along `axis`.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> a = np.zeros((5, 2, 1))
|
||||
>>> b = np.zeros((9, 3))
|
||||
>>> _broadcast_array_shapes((a, b), 1)
|
||||
(5, 3)
|
||||
"""
|
||||
# Note that here, `axis=None` means do not consume/drop any axes - _not_
|
||||
# ravel arrays before broadcasting.
|
||||
shapes = [arr.shape for arr in arrays]
|
||||
return _broadcast_shapes_remove_axis(shapes, axis)
|
||||
|
||||
|
||||
def _broadcast_shapes_remove_axis(shapes, axis=None):
|
||||
"""
|
||||
Broadcast shapes, dropping specified axes
|
||||
|
||||
Same as _broadcast_array_shapes, but given a sequence
|
||||
of array shapes `shapes` instead of the arrays themselves.
|
||||
"""
|
||||
shapes = _broadcast_shapes(shapes, axis)
|
||||
shape = shapes[0]
|
||||
if axis is not None:
|
||||
shape = np.delete(shape, axis)
|
||||
return tuple(shape)
|
||||
|
||||
|
||||
def _broadcast_concatenate(arrays, axis):
|
||||
"""Concatenate arrays along an axis with broadcasting."""
|
||||
arrays = _broadcast_arrays(arrays, axis)
|
||||
res = np.concatenate(arrays, axis=axis)
|
||||
return res
|
||||
|
||||
|
||||
# TODO: add support for `axis` tuples
|
||||
def _remove_nans(samples, paired):
|
||||
"Remove nans from paired or unpaired 1D samples"
|
||||
# potential optimization: don't copy arrays that don't contain nans
|
||||
if not paired:
|
||||
return [sample[~np.isnan(sample)] for sample in samples]
|
||||
|
||||
# for paired samples, we need to remove the whole pair when any part
|
||||
# has a nan
|
||||
nans = np.isnan(samples[0])
|
||||
for sample in samples[1:]:
|
||||
nans = nans | np.isnan(sample)
|
||||
not_nans = ~nans
|
||||
return [sample[not_nans] for sample in samples]
|
||||
|
||||
|
||||
def _remove_sentinel(samples, paired, sentinel):
|
||||
"Remove sentinel values from paired or unpaired 1D samples"
|
||||
# could consolidate with `_remove_nans`, but it's not quite as simple as
|
||||
# passing `sentinel=np.nan` because `(np.nan == np.nan) is False`
|
||||
|
||||
# potential optimization: don't copy arrays that don't contain sentinel
|
||||
if not paired:
|
||||
return [sample[sample != sentinel] for sample in samples]
|
||||
|
||||
# for paired samples, we need to remove the whole pair when any part
|
||||
# has a nan
|
||||
sentinels = (samples[0] == sentinel)
|
||||
for sample in samples[1:]:
|
||||
sentinels = sentinels | (sample == sentinel)
|
||||
not_sentinels = ~sentinels
|
||||
return [sample[not_sentinels] for sample in samples]
|
||||
|
||||
|
||||
def _masked_arrays_2_sentinel_arrays(samples):
|
||||
# masked arrays in `samples` are converted to regular arrays, and values
|
||||
# corresponding with masked elements are replaced with a sentinel value
|
||||
|
||||
# return without modifying arrays if none have a mask
|
||||
has_mask = False
|
||||
for sample in samples:
|
||||
mask = getattr(sample, 'mask', False)
|
||||
has_mask = has_mask or np.any(mask)
|
||||
if not has_mask:
|
||||
return samples, None # None means there is no sentinel value
|
||||
|
||||
# Choose a sentinel value. We can't use `np.nan`, because sentinel (masked)
|
||||
# values are always omitted, but there are different nan policies.
|
||||
dtype = np.result_type(*samples)
|
||||
dtype = dtype if np.issubdtype(dtype, np.number) else np.float64
|
||||
for i in range(len(samples)):
|
||||
# Things get more complicated if the arrays are of different types.
|
||||
# We could have different sentinel values for each array, but
|
||||
# the purpose of this code is convenience, not efficiency.
|
||||
samples[i] = samples[i].astype(dtype, copy=False)
|
||||
|
||||
inexact = np.issubdtype(dtype, np.inexact)
|
||||
info = np.finfo if inexact else np.iinfo
|
||||
max_possible, min_possible = info(dtype).max, info(dtype).min
|
||||
nextafter = np.nextafter if inexact else (lambda x, _: x - 1)
|
||||
|
||||
sentinel = max_possible
|
||||
# For simplicity, min_possible/np.infs are not candidate sentinel values
|
||||
while sentinel > min_possible:
|
||||
for sample in samples:
|
||||
if np.any(sample == sentinel): # choose a new sentinel value
|
||||
sentinel = nextafter(sentinel, -np.inf)
|
||||
break
|
||||
else: # when sentinel value is OK, break the while loop
|
||||
break
|
||||
else:
|
||||
message = ("This function replaces masked elements with sentinel "
|
||||
"values, but the data contains all distinct values of this "
|
||||
"data type. Consider promoting the dtype to `np.float64`.")
|
||||
raise ValueError(message)
|
||||
|
||||
# replace masked elements with sentinel value
|
||||
out_samples = []
|
||||
for sample in samples:
|
||||
mask = getattr(sample, 'mask', None)
|
||||
if mask is not None: # turn all masked arrays into sentinel arrays
|
||||
mask = np.broadcast_to(mask, sample.shape)
|
||||
sample = sample.data.copy() if np.any(mask) else sample.data
|
||||
sample = np.asarray(sample) # `sample.data` could be a memoryview?
|
||||
sample[mask] = sentinel
|
||||
out_samples.append(sample)
|
||||
|
||||
return out_samples, sentinel
|
||||
|
||||
|
||||
def _check_empty_inputs(samples, axis):
|
||||
"""
|
||||
Check for empty sample; return appropriate output for a vectorized hypotest
|
||||
"""
|
||||
# if none of the samples are empty, we need to perform the test
|
||||
if not any((sample.size == 0 for sample in samples)):
|
||||
return None
|
||||
# otherwise, the statistic and p-value will be either empty arrays or
|
||||
# arrays with NaNs. Produce the appropriate array and return it.
|
||||
output_shape = _broadcast_array_shapes_remove_axis(samples, axis)
|
||||
output = np.ones(output_shape) * np.nan
|
||||
return output
|
||||
|
||||
|
||||
def _add_reduced_axes(res, reduced_axes, keepdims):
|
||||
"""
|
||||
Add reduced axes back to all the arrays in the result object
|
||||
if keepdims = True.
|
||||
"""
|
||||
return ([np.expand_dims(output, reduced_axes) for output in res]
|
||||
if keepdims else res)
|
||||
|
||||
|
||||
# Standard docstring / signature entries for `axis`, `nan_policy`, `keepdims`
|
||||
_name = 'axis'
|
||||
_desc = (
|
||||
"""If an int, the axis of the input along which to compute the statistic.
|
||||
The statistic of each axis-slice (e.g. row) of the input will appear in a
|
||||
corresponding element of the output.
|
||||
If ``None``, the input will be raveled before computing the statistic."""
|
||||
.split('\n'))
|
||||
|
||||
|
||||
def _get_axis_params(default_axis=0, _name=_name, _desc=_desc): # bind NOW
|
||||
_type = f"int or None, default: {default_axis}"
|
||||
_axis_parameter_doc = Parameter(_name, _type, _desc)
|
||||
_axis_parameter = inspect.Parameter(_name,
|
||||
inspect.Parameter.KEYWORD_ONLY,
|
||||
default=default_axis)
|
||||
return _axis_parameter_doc, _axis_parameter
|
||||
|
||||
|
||||
_name = 'nan_policy'
|
||||
_type = "{'propagate', 'omit', 'raise'}"
|
||||
_desc = (
|
||||
"""Defines how to handle input NaNs.
|
||||
|
||||
- ``propagate``: if a NaN is present in the axis slice (e.g. row) along
|
||||
which the statistic is computed, the corresponding entry of the output
|
||||
will be NaN.
|
||||
- ``omit``: NaNs will be omitted when performing the calculation.
|
||||
If insufficient data remains in the axis slice along which the
|
||||
statistic is computed, the corresponding entry of the output will be
|
||||
NaN.
|
||||
- ``raise``: if a NaN is present, a ``ValueError`` will be raised."""
|
||||
.split('\n'))
|
||||
_nan_policy_parameter_doc = Parameter(_name, _type, _desc)
|
||||
_nan_policy_parameter = inspect.Parameter(_name,
|
||||
inspect.Parameter.KEYWORD_ONLY,
|
||||
default='propagate')
|
||||
|
||||
_name = 'keepdims'
|
||||
_type = "bool, default: False"
|
||||
_desc = (
|
||||
"""If this is set to True, the axes which are reduced are left
|
||||
in the result as dimensions with size one. With this option,
|
||||
the result will broadcast correctly against the input array."""
|
||||
.split('\n'))
|
||||
_keepdims_parameter_doc = Parameter(_name, _type, _desc)
|
||||
_keepdims_parameter = inspect.Parameter(_name,
|
||||
inspect.Parameter.KEYWORD_ONLY,
|
||||
default=False)
|
||||
|
||||
_standard_note_addition = (
|
||||
"""\nBeginning in SciPy 1.9, ``np.matrix`` inputs (not recommended for new
|
||||
code) are converted to ``np.ndarray`` before the calculation is performed. In
|
||||
this case, the output will be a scalar or ``np.ndarray`` of appropriate shape
|
||||
rather than a 2D ``np.matrix``. Similarly, while masked elements of masked
|
||||
arrays are ignored, the output will be a scalar or ``np.ndarray`` rather than a
|
||||
masked array with ``mask=False``.""").split('\n')
|
||||
|
||||
|
||||
def _axis_nan_policy_factory(tuple_to_result, default_axis=0,
|
||||
n_samples=1, paired=False,
|
||||
result_to_tuple=None, too_small=0,
|
||||
n_outputs=2, kwd_samples=[]):
|
||||
"""Factory for a wrapper that adds axis/nan_policy params to a function.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
tuple_to_result : callable
|
||||
Callable that returns an object of the type returned by the function
|
||||
being wrapped (e.g. the namedtuple or dataclass returned by a
|
||||
statistical test) provided the separate components (e.g. statistic,
|
||||
pvalue).
|
||||
default_axis : int, default: 0
|
||||
The default value of the axis argument. Standard is 0 except when
|
||||
backwards compatibility demands otherwise (e.g. `None`).
|
||||
n_samples : int or callable, default: 1
|
||||
The number of data samples accepted by the function
|
||||
(e.g. `mannwhitneyu`), a callable that accepts a dictionary of
|
||||
parameters passed into the function and returns the number of data
|
||||
samples (e.g. `wilcoxon`), or `None` to indicate an arbitrary number
|
||||
of samples (e.g. `kruskal`).
|
||||
paired : {False, True}
|
||||
Whether the function being wrapped treats the samples as paired (i.e.
|
||||
corresponding elements of each sample should be considered as different
|
||||
components of the same sample.)
|
||||
result_to_tuple : callable, optional
|
||||
Function that unpacks the results of the function being wrapped into
|
||||
a tuple. This is essentially the inverse of `tuple_to_result`. Default
|
||||
is `None`, which is appropriate for statistical tests that return a
|
||||
statistic, pvalue tuple (rather than, e.g., a non-iterable datalass).
|
||||
too_small : int, default: 0
|
||||
The largest unnacceptably small sample for the function being wrapped.
|
||||
For example, some functions require samples of size two or more or they
|
||||
raise an error. This argument prevents the error from being raised when
|
||||
input is not 1D and instead places a NaN in the corresponding element
|
||||
of the result.
|
||||
n_outputs : int or callable, default: 2
|
||||
The number of outputs produced by the function given 1d sample(s). For
|
||||
example, hypothesis tests that return a namedtuple or result object
|
||||
with attributes ``statistic`` and ``pvalue`` use the default
|
||||
``n_outputs=2``; summary statistics with scalar output use
|
||||
``n_outputs=1``. Alternatively, may be a callable that accepts a
|
||||
dictionary of arguments passed into the wrapped function and returns
|
||||
the number of outputs corresponding with those arguments.
|
||||
kwd_samples : sequence, default: []
|
||||
The names of keyword parameters that should be treated as samples. For
|
||||
example, `gmean` accepts as its first argument a sample `a` but
|
||||
also `weights` as a fourth, optional keyword argument. In this case, we
|
||||
use `n_samples=1` and kwd_samples=['weights'].
|
||||
"""
|
||||
|
||||
if result_to_tuple is None:
|
||||
def result_to_tuple(res):
|
||||
return res
|
||||
|
||||
def is_too_small(samples):
|
||||
for sample in samples:
|
||||
if len(sample) <= too_small:
|
||||
return True
|
||||
return False
|
||||
|
||||
def axis_nan_policy_decorator(hypotest_fun_in):
|
||||
@wraps(hypotest_fun_in)
|
||||
def axis_nan_policy_wrapper(*args, _no_deco=False, **kwds):
|
||||
|
||||
if _no_deco: # for testing, decorator does nothing
|
||||
return hypotest_fun_in(*args, **kwds)
|
||||
|
||||
# We need to be flexible about whether position or keyword
|
||||
# arguments are used, but we need to make sure users don't pass
|
||||
# both for the same parameter. To complicate matters, some
|
||||
# functions accept samples with *args, and some functions already
|
||||
# accept `axis` and `nan_policy` as positional arguments.
|
||||
# The strategy is to make sure that there is no duplication
|
||||
# between `args` and `kwds`, combine the two into `kwds`, then
|
||||
# the samples, `nan_policy`, and `axis` from `kwds`, as they are
|
||||
# dealt with separately.
|
||||
|
||||
# Check for intersection between positional and keyword args
|
||||
params = list(inspect.signature(hypotest_fun_in).parameters)
|
||||
if n_samples is None:
|
||||
# Give unique names to each positional sample argument
|
||||
# Note that *args can't be provided as a keyword argument
|
||||
params = [f"arg{i}" for i in range(len(args))] + params[1:]
|
||||
|
||||
d_args = dict(zip(params, args))
|
||||
intersection = set(d_args) & set(kwds)
|
||||
if intersection:
|
||||
message = (f"{hypotest_fun_in.__name__}() got multiple values "
|
||||
f"for argument '{list(intersection)[0]}'")
|
||||
raise TypeError(message)
|
||||
|
||||
# Consolidate other positional and keyword args into `kwds`
|
||||
kwds.update(d_args)
|
||||
|
||||
# rename avoids UnboundLocalError
|
||||
if callable(n_samples):
|
||||
# Future refactoring idea: no need for callable n_samples.
|
||||
# Just replace `n_samples` and `kwd_samples` with a single
|
||||
# list of the names of all samples, and treat all of them
|
||||
# as `kwd_samples` are treated below.
|
||||
n_samp = n_samples(kwds)
|
||||
else:
|
||||
n_samp = n_samples or len(args)
|
||||
|
||||
# get the number of outputs
|
||||
n_out = n_outputs # rename to avoid UnboundLocalError
|
||||
if callable(n_out):
|
||||
n_out = n_out(kwds)
|
||||
|
||||
# If necessary, rearrange function signature: accept other samples
|
||||
# as positional args right after the first n_samp args
|
||||
kwd_samp = [name for name in kwd_samples
|
||||
if kwds.get(name, None) is not None]
|
||||
n_kwd_samp = len(kwd_samp)
|
||||
if not kwd_samp:
|
||||
hypotest_fun_out = hypotest_fun_in
|
||||
else:
|
||||
def hypotest_fun_out(*samples, **kwds):
|
||||
new_kwds = dict(zip(kwd_samp, samples[n_samp:]))
|
||||
kwds.update(new_kwds)
|
||||
return hypotest_fun_in(*samples[:n_samp], **kwds)
|
||||
|
||||
# Extract the things we need here
|
||||
samples = [np.atleast_1d(kwds.pop(param))
|
||||
for param in (params[:n_samp] + kwd_samp)]
|
||||
vectorized = True if 'axis' in params else False
|
||||
axis = kwds.pop('axis', default_axis)
|
||||
nan_policy = kwds.pop('nan_policy', 'propagate')
|
||||
keepdims = kwds.pop("keepdims", False)
|
||||
del args # avoid the possibility of passing both `args` and `kwds`
|
||||
|
||||
# convert masked arrays to regular arrays with sentinel values
|
||||
samples, sentinel = _masked_arrays_2_sentinel_arrays(samples)
|
||||
|
||||
# standardize to always work along last axis
|
||||
reduced_axes = axis
|
||||
if axis is None:
|
||||
if samples:
|
||||
# when axis=None, take the maximum of all dimensions since
|
||||
# all the dimensions are reduced.
|
||||
n_dims = np.max([sample.ndim for sample in samples])
|
||||
reduced_axes = tuple(range(n_dims))
|
||||
samples = [np.asarray(sample.ravel()) for sample in samples]
|
||||
else:
|
||||
samples = _broadcast_arrays(samples, axis=axis)
|
||||
axis = np.atleast_1d(axis)
|
||||
n_axes = len(axis)
|
||||
# move all axes in `axis` to the end to be raveled
|
||||
samples = [np.moveaxis(sample, axis, range(-len(axis), 0))
|
||||
for sample in samples]
|
||||
shapes = [sample.shape for sample in samples]
|
||||
# New shape is unchanged for all axes _not_ in `axis`
|
||||
# At the end, we append the product of the shapes of the axes
|
||||
# in `axis`. Appending -1 doesn't work for zero-size arrays!
|
||||
new_shapes = [shape[:-n_axes] + (np.prod(shape[-n_axes:]),)
|
||||
for shape in shapes]
|
||||
samples = [sample.reshape(new_shape)
|
||||
for sample, new_shape in zip(samples, new_shapes)]
|
||||
axis = -1 # work over the last axis
|
||||
|
||||
# if axis is not needed, just handle nan_policy and return
|
||||
ndims = np.array([sample.ndim for sample in samples])
|
||||
if np.all(ndims <= 1):
|
||||
# Addresses nan_policy == "raise"
|
||||
contains_nans = []
|
||||
for sample in samples:
|
||||
contains_nan, _ = _contains_nan(sample, nan_policy)
|
||||
contains_nans.append(contains_nan)
|
||||
|
||||
# Addresses nan_policy == "propagate"
|
||||
# Consider adding option to let function propagate nans, but
|
||||
# currently the hypothesis tests this is applied to do not
|
||||
# propagate nans in a sensible way
|
||||
if any(contains_nans) and nan_policy == 'propagate':
|
||||
res = np.full(n_out, np.nan)
|
||||
res = _add_reduced_axes(res, reduced_axes, keepdims)
|
||||
return tuple_to_result(*res)
|
||||
|
||||
# Addresses nan_policy == "omit"
|
||||
if any(contains_nans) and nan_policy == 'omit':
|
||||
# consider passing in contains_nans
|
||||
samples = _remove_nans(samples, paired)
|
||||
|
||||
# ideally, this is what the behavior would be:
|
||||
# if is_too_small(samples):
|
||||
# return tuple_to_result(np.nan, np.nan)
|
||||
# but some existing functions raise exceptions, and changing
|
||||
# behavior of those would break backward compatibility.
|
||||
|
||||
if sentinel:
|
||||
samples = _remove_sentinel(samples, paired, sentinel)
|
||||
res = hypotest_fun_out(*samples, **kwds)
|
||||
res = result_to_tuple(res)
|
||||
res = _add_reduced_axes(res, reduced_axes, keepdims)
|
||||
return tuple_to_result(*res)
|
||||
|
||||
# check for empty input
|
||||
# ideally, move this to the top, but some existing functions raise
|
||||
# exceptions for empty input, so overriding it would break
|
||||
# backward compatibility.
|
||||
empty_output = _check_empty_inputs(samples, axis)
|
||||
if empty_output is not None:
|
||||
res = [empty_output.copy() for i in range(n_out)]
|
||||
res = _add_reduced_axes(res, reduced_axes, keepdims)
|
||||
return tuple_to_result(*res)
|
||||
|
||||
# otherwise, concatenate all samples along axis, remembering where
|
||||
# each separate sample begins
|
||||
lengths = np.array([sample.shape[axis] for sample in samples])
|
||||
split_indices = np.cumsum(lengths)
|
||||
x = _broadcast_concatenate(samples, axis)
|
||||
|
||||
# Addresses nan_policy == "raise"
|
||||
contains_nan, _ = _contains_nan(x, nan_policy)
|
||||
|
||||
if vectorized and not contains_nan and not sentinel:
|
||||
res = hypotest_fun_out(*samples, axis=axis, **kwds)
|
||||
res = result_to_tuple(res)
|
||||
res = _add_reduced_axes(res, reduced_axes, keepdims)
|
||||
return tuple_to_result(*res)
|
||||
|
||||
# Addresses nan_policy == "omit"
|
||||
if contains_nan and nan_policy == 'omit':
|
||||
def hypotest_fun(x):
|
||||
samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
|
||||
samples = _remove_nans(samples, paired)
|
||||
if sentinel:
|
||||
samples = _remove_sentinel(samples, paired, sentinel)
|
||||
if is_too_small(samples):
|
||||
return np.full(n_out, np.nan)
|
||||
return result_to_tuple(hypotest_fun_out(*samples, **kwds))
|
||||
|
||||
# Addresses nan_policy == "propagate"
|
||||
elif contains_nan and nan_policy == 'propagate':
|
||||
def hypotest_fun(x):
|
||||
if np.isnan(x).any():
|
||||
return np.full(n_out, np.nan)
|
||||
|
||||
samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
|
||||
if sentinel:
|
||||
samples = _remove_sentinel(samples, paired, sentinel)
|
||||
if is_too_small(samples):
|
||||
return np.full(n_out, np.nan)
|
||||
return result_to_tuple(hypotest_fun_out(*samples, **kwds))
|
||||
|
||||
else:
|
||||
def hypotest_fun(x):
|
||||
samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
|
||||
if sentinel:
|
||||
samples = _remove_sentinel(samples, paired, sentinel)
|
||||
if is_too_small(samples):
|
||||
return np.full(n_out, np.nan)
|
||||
return result_to_tuple(hypotest_fun_out(*samples, **kwds))
|
||||
|
||||
x = np.moveaxis(x, axis, 0)
|
||||
res = np.apply_along_axis(hypotest_fun, axis=0, arr=x)
|
||||
res = _add_reduced_axes(res, reduced_axes, keepdims)
|
||||
return tuple_to_result(*res)
|
||||
|
||||
_axis_parameter_doc, _axis_parameter = _get_axis_params(default_axis)
|
||||
doc = FunctionDoc(axis_nan_policy_wrapper)
|
||||
parameter_names = [param.name for param in doc['Parameters']]
|
||||
if 'axis' in parameter_names:
|
||||
doc['Parameters'][parameter_names.index('axis')] = (
|
||||
_axis_parameter_doc)
|
||||
else:
|
||||
doc['Parameters'].append(_axis_parameter_doc)
|
||||
if 'nan_policy' in parameter_names:
|
||||
doc['Parameters'][parameter_names.index('nan_policy')] = (
|
||||
_nan_policy_parameter_doc)
|
||||
else:
|
||||
doc['Parameters'].append(_nan_policy_parameter_doc)
|
||||
if 'keepdims' in parameter_names:
|
||||
doc['Parameters'][parameter_names.index('keepdims')] = (
|
||||
_keepdims_parameter_doc)
|
||||
else:
|
||||
doc['Parameters'].append(_keepdims_parameter_doc)
|
||||
doc['Notes'] += _standard_note_addition
|
||||
doc = str(doc).split("\n", 1)[1] # remove signature
|
||||
axis_nan_policy_wrapper.__doc__ = str(doc)
|
||||
|
||||
sig = inspect.signature(axis_nan_policy_wrapper)
|
||||
parameters = sig.parameters
|
||||
parameter_list = list(parameters.values())
|
||||
if 'axis' not in parameters:
|
||||
parameter_list.append(_axis_parameter)
|
||||
if 'nan_policy' not in parameters:
|
||||
parameter_list.append(_nan_policy_parameter)
|
||||
if 'keepdims' not in parameters:
|
||||
parameter_list.append(_keepdims_parameter)
|
||||
sig = sig.replace(parameters=parameter_list)
|
||||
axis_nan_policy_wrapper.__signature__ = sig
|
||||
|
||||
return axis_nan_policy_wrapper
|
||||
return axis_nan_policy_decorator
|
||||
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_biasedurn.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_biasedurn.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_biasedurn.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_biasedurn.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
27
.CondaPkg/env/Lib/site-packages/scipy/stats/_biasedurn.pxd
vendored
Normal file
27
.CondaPkg/env/Lib/site-packages/scipy/stats/_biasedurn.pxd
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
# Declare the class with cdef
|
||||
cdef extern from "biasedurn/stocc.h" nogil:
|
||||
cdef cppclass CFishersNCHypergeometric:
|
||||
CFishersNCHypergeometric(int, int, int, double, double) except +
|
||||
int mode()
|
||||
double mean()
|
||||
double variance()
|
||||
double probability(int x)
|
||||
double moments(double * mean, double * var)
|
||||
|
||||
cdef cppclass CWalleniusNCHypergeometric:
|
||||
CWalleniusNCHypergeometric() except +
|
||||
CWalleniusNCHypergeometric(int, int, int, double, double) except +
|
||||
int mode()
|
||||
double mean()
|
||||
double variance()
|
||||
double probability(int x)
|
||||
double moments(double * mean, double * var)
|
||||
|
||||
cdef cppclass StochasticLib3:
|
||||
StochasticLib3(int seed) except +
|
||||
double Random() except +
|
||||
void SetAccuracy(double accur)
|
||||
int FishersNCHyp (int n, int m, int N, double odds) except +
|
||||
int WalleniusNCHyp (int n, int m, int N, double odds) except +
|
||||
double(*next_double)()
|
||||
double(*next_normal)(const double m, const double s)
|
||||
795
.CondaPkg/env/Lib/site-packages/scipy/stats/_binned_statistic.py
vendored
Normal file
795
.CondaPkg/env/Lib/site-packages/scipy/stats/_binned_statistic.py
vendored
Normal file
@@ -0,0 +1,795 @@
|
||||
import builtins
|
||||
import numpy as np
|
||||
from numpy.testing import suppress_warnings
|
||||
from operator import index
|
||||
from collections import namedtuple
|
||||
|
||||
__all__ = ['binned_statistic',
|
||||
'binned_statistic_2d',
|
||||
'binned_statistic_dd']
|
||||
|
||||
|
||||
BinnedStatisticResult = namedtuple('BinnedStatisticResult',
|
||||
('statistic', 'bin_edges', 'binnumber'))
|
||||
|
||||
|
||||
def binned_statistic(x, values, statistic='mean',
|
||||
bins=10, range=None):
|
||||
"""
|
||||
Compute a binned statistic for one or more sets of data.
|
||||
|
||||
This is a generalization of a histogram function. A histogram divides
|
||||
the space into bins, and returns the count of the number of points in
|
||||
each bin. This function allows the computation of the sum, mean, median,
|
||||
or other statistic of the values (or set of values) within each bin.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : (N,) array_like
|
||||
A sequence of values to be binned.
|
||||
values : (N,) array_like or list of (N,) array_like
|
||||
The data on which the statistic will be computed. This must be
|
||||
the same shape as `x`, or a set of sequences - each the same shape as
|
||||
`x`. If `values` is a set of sequences, the statistic will be computed
|
||||
on each independently.
|
||||
statistic : string or callable, optional
|
||||
The statistic to compute (default is 'mean').
|
||||
The following statistics are available:
|
||||
|
||||
* 'mean' : compute the mean of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'std' : compute the standard deviation within each bin. This
|
||||
is implicitly calculated with ddof=0.
|
||||
* 'median' : compute the median of values for points within each
|
||||
bin. Empty bins will be represented by NaN.
|
||||
* 'count' : compute the count of points within each bin. This is
|
||||
identical to an unweighted histogram. `values` array is not
|
||||
referenced.
|
||||
* 'sum' : compute the sum of values for points within each bin.
|
||||
This is identical to a weighted histogram.
|
||||
* 'min' : compute the minimum of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'max' : compute the maximum of values for point within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* function : a user-defined function which takes a 1D array of
|
||||
values, and outputs a single numerical statistic. This function
|
||||
will be called on the values in each bin. Empty bins will be
|
||||
represented by function([]), or NaN if this returns an error.
|
||||
|
||||
bins : int or sequence of scalars, optional
|
||||
If `bins` is an int, it defines the number of equal-width bins in the
|
||||
given range (10 by default). If `bins` is a sequence, it defines the
|
||||
bin edges, including the rightmost edge, allowing for non-uniform bin
|
||||
widths. Values in `x` that are smaller than lowest bin edge are
|
||||
assigned to bin number 0, values beyond the highest bin are assigned to
|
||||
``bins[-1]``. If the bin edges are specified, the number of bins will
|
||||
be, (nx = len(bins)-1).
|
||||
range : (float, float) or [(float, float)], optional
|
||||
The lower and upper range of the bins. If not provided, range
|
||||
is simply ``(x.min(), x.max())``. Values outside the range are
|
||||
ignored.
|
||||
|
||||
Returns
|
||||
-------
|
||||
statistic : array
|
||||
The values of the selected statistic in each bin.
|
||||
bin_edges : array of dtype float
|
||||
Return the bin edges ``(length(statistic)+1)``.
|
||||
binnumber: 1-D ndarray of ints
|
||||
Indices of the bins (corresponding to `bin_edges`) in which each value
|
||||
of `x` belongs. Same length as `values`. A binnumber of `i` means the
|
||||
corresponding value is between (bin_edges[i-1], bin_edges[i]).
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd
|
||||
|
||||
Notes
|
||||
-----
|
||||
All but the last (righthand-most) bin is half-open. In other words, if
|
||||
`bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
|
||||
but excluding 2) and the second ``[2, 3)``. The last bin, however, is
|
||||
``[3, 4]``, which *includes* 4.
|
||||
|
||||
.. versionadded:: 0.11.0
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> import matplotlib.pyplot as plt
|
||||
|
||||
First some basic examples:
|
||||
|
||||
Create two evenly spaced bins in the range of the given sample, and sum the
|
||||
corresponding values in each of those bins:
|
||||
|
||||
>>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
|
||||
>>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
|
||||
BinnedStatisticResult(statistic=array([4. , 4.5]),
|
||||
bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))
|
||||
|
||||
Multiple arrays of values can also be passed. The statistic is calculated
|
||||
on each set independently:
|
||||
|
||||
>>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
|
||||
>>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
|
||||
BinnedStatisticResult(statistic=array([[4. , 4.5],
|
||||
[8. , 9. ]]), bin_edges=array([1., 4., 7.]),
|
||||
binnumber=array([1, 1, 1, 2, 2]))
|
||||
|
||||
>>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
|
||||
... bins=3)
|
||||
BinnedStatisticResult(statistic=array([1., 2., 4.]),
|
||||
bin_edges=array([1., 2., 3., 4.]),
|
||||
binnumber=array([1, 2, 1, 2, 3]))
|
||||
|
||||
As a second example, we now generate some random data of sailing boat speed
|
||||
as a function of wind speed, and then determine how fast our boat is for
|
||||
certain wind speeds:
|
||||
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> windspeed = 8 * rng.random(500)
|
||||
>>> boatspeed = .3 * windspeed**.5 + .2 * rng.random(500)
|
||||
>>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
|
||||
... boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
|
||||
>>> plt.figure()
|
||||
>>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
|
||||
>>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
|
||||
... label='binned statistic of data')
|
||||
>>> plt.legend()
|
||||
|
||||
Now we can use ``binnumber`` to select all datapoints with a windspeed
|
||||
below 1:
|
||||
|
||||
>>> low_boatspeed = boatspeed[binnumber == 0]
|
||||
|
||||
As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
|
||||
plot of a distribution that shows the mean and distribution around that
|
||||
mean per bin, on top of a regular histogram and the probability
|
||||
distribution function:
|
||||
|
||||
>>> x = np.linspace(0, 5, num=500)
|
||||
>>> x_pdf = stats.maxwell.pdf(x)
|
||||
>>> samples = stats.maxwell.rvs(size=10000)
|
||||
|
||||
>>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
|
||||
... statistic='mean', bins=25)
|
||||
>>> bin_width = (bin_edges[1] - bin_edges[0])
|
||||
>>> bin_centers = bin_edges[1:] - bin_width/2
|
||||
|
||||
>>> plt.figure()
|
||||
>>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
|
||||
... alpha=0.2, label='histogram of data')
|
||||
>>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
|
||||
>>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
|
||||
... label='binned statistic of data')
|
||||
>>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
|
||||
>>> plt.legend(fontsize=10)
|
||||
>>> plt.show()
|
||||
|
||||
"""
|
||||
try:
|
||||
N = len(bins)
|
||||
except TypeError:
|
||||
N = 1
|
||||
|
||||
if N != 1:
|
||||
bins = [np.asarray(bins, float)]
|
||||
|
||||
if range is not None:
|
||||
if len(range) == 2:
|
||||
range = [range]
|
||||
|
||||
medians, edges, binnumbers = binned_statistic_dd(
|
||||
[x], values, statistic, bins, range)
|
||||
|
||||
return BinnedStatisticResult(medians, edges[0], binnumbers)
|
||||
|
||||
|
||||
BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult',
|
||||
('statistic', 'x_edge', 'y_edge',
|
||||
'binnumber'))
|
||||
|
||||
|
||||
def binned_statistic_2d(x, y, values, statistic='mean',
|
||||
bins=10, range=None, expand_binnumbers=False):
|
||||
"""
|
||||
Compute a bidimensional binned statistic for one or more sets of data.
|
||||
|
||||
This is a generalization of a histogram2d function. A histogram divides
|
||||
the space into bins, and returns the count of the number of points in
|
||||
each bin. This function allows the computation of the sum, mean, median,
|
||||
or other statistic of the values (or set of values) within each bin.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : (N,) array_like
|
||||
A sequence of values to be binned along the first dimension.
|
||||
y : (N,) array_like
|
||||
A sequence of values to be binned along the second dimension.
|
||||
values : (N,) array_like or list of (N,) array_like
|
||||
The data on which the statistic will be computed. This must be
|
||||
the same shape as `x`, or a list of sequences - each with the same
|
||||
shape as `x`. If `values` is such a list, the statistic will be
|
||||
computed on each independently.
|
||||
statistic : string or callable, optional
|
||||
The statistic to compute (default is 'mean').
|
||||
The following statistics are available:
|
||||
|
||||
* 'mean' : compute the mean of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'std' : compute the standard deviation within each bin. This
|
||||
is implicitly calculated with ddof=0.
|
||||
* 'median' : compute the median of values for points within each
|
||||
bin. Empty bins will be represented by NaN.
|
||||
* 'count' : compute the count of points within each bin. This is
|
||||
identical to an unweighted histogram. `values` array is not
|
||||
referenced.
|
||||
* 'sum' : compute the sum of values for points within each bin.
|
||||
This is identical to a weighted histogram.
|
||||
* 'min' : compute the minimum of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'max' : compute the maximum of values for point within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* function : a user-defined function which takes a 1D array of
|
||||
values, and outputs a single numerical statistic. This function
|
||||
will be called on the values in each bin. Empty bins will be
|
||||
represented by function([]), or NaN if this returns an error.
|
||||
|
||||
bins : int or [int, int] or array_like or [array, array], optional
|
||||
The bin specification:
|
||||
|
||||
* the number of bins for the two dimensions (nx = ny = bins),
|
||||
* the number of bins in each dimension (nx, ny = bins),
|
||||
* the bin edges for the two dimensions (x_edge = y_edge = bins),
|
||||
* the bin edges in each dimension (x_edge, y_edge = bins).
|
||||
|
||||
If the bin edges are specified, the number of bins will be,
|
||||
(nx = len(x_edge)-1, ny = len(y_edge)-1).
|
||||
|
||||
range : (2,2) array_like, optional
|
||||
The leftmost and rightmost edges of the bins along each dimension
|
||||
(if not specified explicitly in the `bins` parameters):
|
||||
[[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
|
||||
considered outliers and not tallied in the histogram.
|
||||
expand_binnumbers : bool, optional
|
||||
'False' (default): the returned `binnumber` is a shape (N,) array of
|
||||
linearized bin indices.
|
||||
'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
|
||||
ndarray, where each row gives the bin numbers in the corresponding
|
||||
dimension.
|
||||
See the `binnumber` returned value, and the `Examples` section.
|
||||
|
||||
.. versionadded:: 0.17.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
statistic : (nx, ny) ndarray
|
||||
The values of the selected statistic in each two-dimensional bin.
|
||||
x_edge : (nx + 1) ndarray
|
||||
The bin edges along the first dimension.
|
||||
y_edge : (ny + 1) ndarray
|
||||
The bin edges along the second dimension.
|
||||
binnumber : (N,) array of ints or (2,N) ndarray of ints
|
||||
This assigns to each element of `sample` an integer that represents the
|
||||
bin in which this observation falls. The representation depends on the
|
||||
`expand_binnumbers` argument. See `Notes` for details.
|
||||
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd
|
||||
|
||||
Notes
|
||||
-----
|
||||
Binedges:
|
||||
All but the last (righthand-most) bin is half-open. In other words, if
|
||||
`bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
|
||||
but excluding 2) and the second ``[2, 3)``. The last bin, however, is
|
||||
``[3, 4]``, which *includes* 4.
|
||||
|
||||
`binnumber`:
|
||||
This returned argument assigns to each element of `sample` an integer that
|
||||
represents the bin in which it belongs. The representation depends on the
|
||||
`expand_binnumbers` argument. If 'False' (default): The returned
|
||||
`binnumber` is a shape (N,) array of linearized indices mapping each
|
||||
element of `sample` to its corresponding bin (using row-major ordering).
|
||||
Note that the returned linearized bin indices are used for an array with
|
||||
extra bins on the outer binedges to capture values outside of the defined
|
||||
bin bounds.
|
||||
If 'True': The returned `binnumber` is a shape (2,N) ndarray where
|
||||
each row indicates bin placements for each dimension respectively. In each
|
||||
dimension, a binnumber of `i` means the corresponding value is between
|
||||
(D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.
|
||||
|
||||
.. versionadded:: 0.11.0
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy import stats
|
||||
|
||||
Calculate the counts with explicit bin-edges:
|
||||
|
||||
>>> x = [0.1, 0.1, 0.1, 0.6]
|
||||
>>> y = [2.1, 2.6, 2.1, 2.1]
|
||||
>>> binx = [0.0, 0.5, 1.0]
|
||||
>>> biny = [2.0, 2.5, 3.0]
|
||||
>>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
|
||||
>>> ret.statistic
|
||||
array([[2., 1.],
|
||||
[1., 0.]])
|
||||
|
||||
The bin in which each sample is placed is given by the `binnumber`
|
||||
returned parameter. By default, these are the linearized bin indices:
|
||||
|
||||
>>> ret.binnumber
|
||||
array([5, 6, 5, 9])
|
||||
|
||||
The bin indices can also be expanded into separate entries for each
|
||||
dimension using the `expand_binnumbers` parameter:
|
||||
|
||||
>>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
|
||||
... expand_binnumbers=True)
|
||||
>>> ret.binnumber
|
||||
array([[1, 1, 1, 2],
|
||||
[1, 2, 1, 1]])
|
||||
|
||||
Which shows that the first three elements belong in the xbin 1, and the
|
||||
fourth into xbin 2; and so on for y.
|
||||
|
||||
"""
|
||||
|
||||
# This code is based on np.histogram2d
|
||||
try:
|
||||
N = len(bins)
|
||||
except TypeError:
|
||||
N = 1
|
||||
|
||||
if N != 1 and N != 2:
|
||||
xedges = yedges = np.asarray(bins, float)
|
||||
bins = [xedges, yedges]
|
||||
|
||||
medians, edges, binnumbers = binned_statistic_dd(
|
||||
[x, y], values, statistic, bins, range,
|
||||
expand_binnumbers=expand_binnumbers)
|
||||
|
||||
return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
|
||||
|
||||
|
||||
BinnedStatisticddResult = namedtuple('BinnedStatisticddResult',
|
||||
('statistic', 'bin_edges',
|
||||
'binnumber'))
|
||||
|
||||
|
||||
def _bincount(x, weights):
|
||||
if np.iscomplexobj(weights):
|
||||
a = np.bincount(x, np.real(weights))
|
||||
b = np.bincount(x, np.imag(weights))
|
||||
z = a + b*1j
|
||||
|
||||
else:
|
||||
z = np.bincount(x, weights)
|
||||
return z
|
||||
|
||||
|
||||
def binned_statistic_dd(sample, values, statistic='mean',
|
||||
bins=10, range=None, expand_binnumbers=False,
|
||||
binned_statistic_result=None):
|
||||
"""
|
||||
Compute a multidimensional binned statistic for a set of data.
|
||||
|
||||
This is a generalization of a histogramdd function. A histogram divides
|
||||
the space into bins, and returns the count of the number of points in
|
||||
each bin. This function allows the computation of the sum, mean, median,
|
||||
or other statistic of the values within each bin.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sample : array_like
|
||||
Data to histogram passed as a sequence of N arrays of length D, or
|
||||
as an (N,D) array.
|
||||
values : (N,) array_like or list of (N,) array_like
|
||||
The data on which the statistic will be computed. This must be
|
||||
the same shape as `sample`, or a list of sequences - each with the
|
||||
same shape as `sample`. If `values` is such a list, the statistic
|
||||
will be computed on each independently.
|
||||
statistic : string or callable, optional
|
||||
The statistic to compute (default is 'mean').
|
||||
The following statistics are available:
|
||||
|
||||
* 'mean' : compute the mean of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'median' : compute the median of values for points within each
|
||||
bin. Empty bins will be represented by NaN.
|
||||
* 'count' : compute the count of points within each bin. This is
|
||||
identical to an unweighted histogram. `values` array is not
|
||||
referenced.
|
||||
* 'sum' : compute the sum of values for points within each bin.
|
||||
This is identical to a weighted histogram.
|
||||
* 'std' : compute the standard deviation within each bin. This
|
||||
is implicitly calculated with ddof=0. If the number of values
|
||||
within a given bin is 0 or 1, the computed standard deviation value
|
||||
will be 0 for the bin.
|
||||
* 'min' : compute the minimum of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'max' : compute the maximum of values for point within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* function : a user-defined function which takes a 1D array of
|
||||
values, and outputs a single numerical statistic. This function
|
||||
will be called on the values in each bin. Empty bins will be
|
||||
represented by function([]), or NaN if this returns an error.
|
||||
|
||||
bins : sequence or positive int, optional
|
||||
The bin specification must be in one of the following forms:
|
||||
|
||||
* A sequence of arrays describing the bin edges along each dimension.
|
||||
* The number of bins for each dimension (nx, ny, ... = bins).
|
||||
* The number of bins for all dimensions (nx = ny = ... = bins).
|
||||
range : sequence, optional
|
||||
A sequence of lower and upper bin edges to be used if the edges are
|
||||
not given explicitly in `bins`. Defaults to the minimum and maximum
|
||||
values along each dimension.
|
||||
expand_binnumbers : bool, optional
|
||||
'False' (default): the returned `binnumber` is a shape (N,) array of
|
||||
linearized bin indices.
|
||||
'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
|
||||
ndarray, where each row gives the bin numbers in the corresponding
|
||||
dimension.
|
||||
See the `binnumber` returned value, and the `Examples` section of
|
||||
`binned_statistic_2d`.
|
||||
binned_statistic_result : binnedStatisticddResult
|
||||
Result of a previous call to the function in order to reuse bin edges
|
||||
and bin numbers with new values and/or a different statistic.
|
||||
To reuse bin numbers, `expand_binnumbers` must have been set to False
|
||||
(the default)
|
||||
|
||||
.. versionadded:: 0.17.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
statistic : ndarray, shape(nx1, nx2, nx3,...)
|
||||
The values of the selected statistic in each two-dimensional bin.
|
||||
bin_edges : list of ndarrays
|
||||
A list of D arrays describing the (nxi + 1) bin edges for each
|
||||
dimension.
|
||||
binnumber : (N,) array of ints or (D,N) ndarray of ints
|
||||
This assigns to each element of `sample` an integer that represents the
|
||||
bin in which this observation falls. The representation depends on the
|
||||
`expand_binnumbers` argument. See `Notes` for details.
|
||||
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
|
||||
|
||||
Notes
|
||||
-----
|
||||
Binedges:
|
||||
All but the last (righthand-most) bin is half-open in each dimension. In
|
||||
other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
|
||||
``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The
|
||||
last bin, however, is ``[3, 4]``, which *includes* 4.
|
||||
|
||||
`binnumber`:
|
||||
This returned argument assigns to each element of `sample` an integer that
|
||||
represents the bin in which it belongs. The representation depends on the
|
||||
`expand_binnumbers` argument. If 'False' (default): The returned
|
||||
`binnumber` is a shape (N,) array of linearized indices mapping each
|
||||
element of `sample` to its corresponding bin (using row-major ordering).
|
||||
If 'True': The returned `binnumber` is a shape (D,N) ndarray where
|
||||
each row indicates bin placements for each dimension respectively. In each
|
||||
dimension, a binnumber of `i` means the corresponding value is between
|
||||
(bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
|
||||
|
||||
.. versionadded:: 0.11.0
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from mpl_toolkits.mplot3d import Axes3D
|
||||
|
||||
Take an array of 600 (x, y) coordinates as an example.
|
||||
`binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot
|
||||
of dimension `D+1` is required.
|
||||
|
||||
>>> mu = np.array([0., 1.])
|
||||
>>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
|
||||
>>> multinormal = stats.multivariate_normal(mu, sigma)
|
||||
>>> data = multinormal.rvs(size=600, random_state=235412)
|
||||
>>> data.shape
|
||||
(600, 2)
|
||||
|
||||
Create bins and count how many arrays fall in each bin:
|
||||
|
||||
>>> N = 60
|
||||
>>> x = np.linspace(-3, 3, N)
|
||||
>>> y = np.linspace(-3, 4, N)
|
||||
>>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
|
||||
... statistic='count')
|
||||
>>> bincounts = ret.statistic
|
||||
|
||||
Set the volume and the location of bars:
|
||||
|
||||
>>> dx = x[1] - x[0]
|
||||
>>> dy = y[1] - y[0]
|
||||
>>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
|
||||
>>> z = 0
|
||||
|
||||
>>> bincounts = bincounts.ravel()
|
||||
>>> x = x.ravel()
|
||||
>>> y = y.ravel()
|
||||
|
||||
>>> fig = plt.figure()
|
||||
>>> ax = fig.add_subplot(111, projection='3d')
|
||||
>>> with np.errstate(divide='ignore'): # silence random axes3d warning
|
||||
... ax.bar3d(x, y, z, dx, dy, bincounts)
|
||||
|
||||
Reuse bin numbers and bin edges with new values:
|
||||
|
||||
>>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
|
||||
... binned_statistic_result=ret,
|
||||
... statistic='mean')
|
||||
"""
|
||||
known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
|
||||
if not callable(statistic) and statistic not in known_stats:
|
||||
raise ValueError('invalid statistic %r' % (statistic,))
|
||||
|
||||
try:
|
||||
bins = index(bins)
|
||||
except TypeError:
|
||||
# bins is not an integer
|
||||
pass
|
||||
# If bins was an integer-like object, now it is an actual Python int.
|
||||
|
||||
# NOTE: for _bin_edges(), see e.g. gh-11365
|
||||
if isinstance(bins, int) and not np.isfinite(sample).all():
|
||||
raise ValueError('%r contains non-finite values.' % (sample,))
|
||||
|
||||
# `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
|
||||
# `Dlen` is the length of elements along each dimension.
|
||||
# This code is based on np.histogramdd
|
||||
try:
|
||||
# `sample` is an ND-array.
|
||||
Dlen, Ndim = sample.shape
|
||||
except (AttributeError, ValueError):
|
||||
# `sample` is a sequence of 1D arrays.
|
||||
sample = np.atleast_2d(sample).T
|
||||
Dlen, Ndim = sample.shape
|
||||
|
||||
# Store initial shape of `values` to preserve it in the output
|
||||
values = np.asarray(values)
|
||||
input_shape = list(values.shape)
|
||||
# Make sure that `values` is 2D to iterate over rows
|
||||
values = np.atleast_2d(values)
|
||||
Vdim, Vlen = values.shape
|
||||
|
||||
# Make sure `values` match `sample`
|
||||
if statistic != 'count' and Vlen != Dlen:
|
||||
raise AttributeError('The number of `values` elements must match the '
|
||||
'length of each `sample` dimension.')
|
||||
|
||||
try:
|
||||
M = len(bins)
|
||||
if M != Ndim:
|
||||
raise AttributeError('The dimension of bins must be equal '
|
||||
'to the dimension of the sample x.')
|
||||
except TypeError:
|
||||
bins = Ndim * [bins]
|
||||
|
||||
if binned_statistic_result is None:
|
||||
nbin, edges, dedges = _bin_edges(sample, bins, range)
|
||||
binnumbers = _bin_numbers(sample, nbin, edges, dedges)
|
||||
else:
|
||||
edges = binned_statistic_result.bin_edges
|
||||
nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
|
||||
# +1 for outlier bins
|
||||
dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
|
||||
binnumbers = binned_statistic_result.binnumber
|
||||
|
||||
# Avoid overflow with double precision. Complex `values` -> `complex128`.
|
||||
result_type = np.result_type(values, np.float64)
|
||||
result = np.empty([Vdim, nbin.prod()], dtype=result_type)
|
||||
|
||||
if statistic in {'mean', np.mean}:
|
||||
result.fill(np.nan)
|
||||
flatcount = _bincount(binnumbers, None)
|
||||
a = flatcount.nonzero()
|
||||
for vv in builtins.range(Vdim):
|
||||
flatsum = _bincount(binnumbers, values[vv])
|
||||
result[vv, a] = flatsum[a] / flatcount[a]
|
||||
elif statistic in {'std', np.std}:
|
||||
result.fill(np.nan)
|
||||
flatcount = _bincount(binnumbers, None)
|
||||
a = flatcount.nonzero()
|
||||
for vv in builtins.range(Vdim):
|
||||
flatsum = _bincount(binnumbers, values[vv])
|
||||
delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers]
|
||||
std = np.sqrt(
|
||||
_bincount(binnumbers, delta*np.conj(delta))[a] / flatcount[a]
|
||||
)
|
||||
result[vv, a] = std
|
||||
result = np.real(result)
|
||||
elif statistic == 'count':
|
||||
result = np.empty([Vdim, nbin.prod()], dtype=np.float64)
|
||||
result.fill(0)
|
||||
flatcount = _bincount(binnumbers, None)
|
||||
a = np.arange(len(flatcount))
|
||||
result[:, a] = flatcount[np.newaxis, :]
|
||||
elif statistic in {'sum', np.sum}:
|
||||
result.fill(0)
|
||||
for vv in builtins.range(Vdim):
|
||||
flatsum = _bincount(binnumbers, values[vv])
|
||||
a = np.arange(len(flatsum))
|
||||
result[vv, a] = flatsum
|
||||
elif statistic in {'median', np.median}:
|
||||
result.fill(np.nan)
|
||||
for vv in builtins.range(Vdim):
|
||||
i = np.lexsort((values[vv], binnumbers))
|
||||
_, j, counts = np.unique(binnumbers[i],
|
||||
return_index=True, return_counts=True)
|
||||
mid = j + (counts - 1) / 2
|
||||
mid_a = values[vv, i][np.floor(mid).astype(int)]
|
||||
mid_b = values[vv, i][np.ceil(mid).astype(int)]
|
||||
medians = (mid_a + mid_b) / 2
|
||||
result[vv, binnumbers[i][j]] = medians
|
||||
elif statistic in {'min', np.min}:
|
||||
result.fill(np.nan)
|
||||
for vv in builtins.range(Vdim):
|
||||
i = np.argsort(values[vv])[::-1] # Reversed so the min is last
|
||||
result[vv, binnumbers[i]] = values[vv, i]
|
||||
elif statistic in {'max', np.max}:
|
||||
result.fill(np.nan)
|
||||
for vv in builtins.range(Vdim):
|
||||
i = np.argsort(values[vv])
|
||||
result[vv, binnumbers[i]] = values[vv, i]
|
||||
elif callable(statistic):
|
||||
with np.errstate(invalid='ignore'), suppress_warnings() as sup:
|
||||
sup.filter(RuntimeWarning)
|
||||
try:
|
||||
null = statistic([])
|
||||
except Exception:
|
||||
null = np.nan
|
||||
if np.iscomplexobj(null):
|
||||
result = result.astype(np.complex128)
|
||||
result.fill(null)
|
||||
try:
|
||||
_calc_binned_statistic(
|
||||
Vdim, binnumbers, result, values, statistic
|
||||
)
|
||||
except ValueError:
|
||||
result = result.astype(np.complex128)
|
||||
_calc_binned_statistic(
|
||||
Vdim, binnumbers, result, values, statistic
|
||||
)
|
||||
|
||||
# Shape into a proper matrix
|
||||
result = result.reshape(np.append(Vdim, nbin))
|
||||
|
||||
# Remove outliers (indices 0 and -1 for each bin-dimension).
|
||||
core = tuple([slice(None)] + Ndim * [slice(1, -1)])
|
||||
result = result[core]
|
||||
|
||||
# Unravel binnumbers into an ndarray, each row the bins for each dimension
|
||||
if expand_binnumbers and Ndim > 1:
|
||||
binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
|
||||
|
||||
if np.any(result.shape[1:] != nbin - 2):
|
||||
raise RuntimeError('Internal Shape Error')
|
||||
|
||||
# Reshape to have output (`result`) match input (`values`) shape
|
||||
result = result.reshape(input_shape[:-1] + list(nbin-2))
|
||||
|
||||
return BinnedStatisticddResult(result, edges, binnumbers)
|
||||
|
||||
|
||||
def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func):
|
||||
unique_bin_numbers = np.unique(bin_numbers)
|
||||
for vv in builtins.range(Vdim):
|
||||
bin_map = _create_binned_data(bin_numbers, unique_bin_numbers,
|
||||
values, vv)
|
||||
for i in unique_bin_numbers:
|
||||
stat = stat_func(np.array(bin_map[i]))
|
||||
if np.iscomplexobj(stat) and not np.iscomplexobj(result):
|
||||
raise ValueError("The statistic function returns complex ")
|
||||
result[vv, i] = stat
|
||||
|
||||
|
||||
def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv):
|
||||
""" Create hashmap of bin ids to values in bins
|
||||
key: bin number
|
||||
value: list of binned data
|
||||
"""
|
||||
bin_map = dict()
|
||||
for i in unique_bin_numbers:
|
||||
bin_map[i] = []
|
||||
for i in builtins.range(len(bin_numbers)):
|
||||
bin_map[bin_numbers[i]].append(values[vv, i])
|
||||
return bin_map
|
||||
|
||||
|
||||
def _bin_edges(sample, bins=None, range=None):
|
||||
""" Create edge arrays
|
||||
"""
|
||||
Dlen, Ndim = sample.shape
|
||||
|
||||
nbin = np.empty(Ndim, int) # Number of bins in each dimension
|
||||
edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
|
||||
dedges = Ndim * [None] # Spacing between edges (will be 2D array)
|
||||
|
||||
# Select range for each dimension
|
||||
# Used only if number of bins is given.
|
||||
if range is None:
|
||||
smin = np.atleast_1d(np.array(sample.min(axis=0), float))
|
||||
smax = np.atleast_1d(np.array(sample.max(axis=0), float))
|
||||
else:
|
||||
if len(range) != Ndim:
|
||||
raise ValueError(
|
||||
f"range given for {len(range)} dimensions; {Ndim} required")
|
||||
smin = np.empty(Ndim)
|
||||
smax = np.empty(Ndim)
|
||||
for i in builtins.range(Ndim):
|
||||
if range[i][1] < range[i][0]:
|
||||
raise ValueError(
|
||||
"In {}range, start must be <= stop".format(
|
||||
f"dimension {i + 1} of " if Ndim > 1 else ""))
|
||||
smin[i], smax[i] = range[i]
|
||||
|
||||
# Make sure the bins have a finite width.
|
||||
for i in builtins.range(len(smin)):
|
||||
if smin[i] == smax[i]:
|
||||
smin[i] = smin[i] - .5
|
||||
smax[i] = smax[i] + .5
|
||||
|
||||
# Preserve sample floating point precision in bin edges
|
||||
edges_dtype = (sample.dtype if np.issubdtype(sample.dtype, np.floating)
|
||||
else float)
|
||||
|
||||
# Create edge arrays
|
||||
for i in builtins.range(Ndim):
|
||||
if np.isscalar(bins[i]):
|
||||
nbin[i] = bins[i] + 2 # +2 for outlier bins
|
||||
edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1,
|
||||
dtype=edges_dtype)
|
||||
else:
|
||||
edges[i] = np.asarray(bins[i], edges_dtype)
|
||||
nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
|
||||
dedges[i] = np.diff(edges[i])
|
||||
|
||||
nbin = np.asarray(nbin)
|
||||
|
||||
return nbin, edges, dedges
|
||||
|
||||
|
||||
def _bin_numbers(sample, nbin, edges, dedges):
|
||||
"""Compute the bin number each sample falls into, in each dimension
|
||||
"""
|
||||
Dlen, Ndim = sample.shape
|
||||
|
||||
sampBin = [
|
||||
np.digitize(sample[:, i], edges[i])
|
||||
for i in range(Ndim)
|
||||
]
|
||||
|
||||
# Using `digitize`, values that fall on an edge are put in the right bin.
|
||||
# For the rightmost bin, we want values equal to the right
|
||||
# edge to be counted in the last bin, and not as an outlier.
|
||||
for i in range(Ndim):
|
||||
# Find the rounding precision
|
||||
dedges_min = dedges[i].min()
|
||||
if dedges_min == 0:
|
||||
raise ValueError('The smallest edge difference is numerically 0.')
|
||||
decimal = int(-np.log10(dedges_min)) + 6
|
||||
# Find which points are on the rightmost edge.
|
||||
on_edge = np.where((sample[:, i] >= edges[i][-1]) &
|
||||
(np.around(sample[:, i], decimal) ==
|
||||
np.around(edges[i][-1], decimal)))[0]
|
||||
# Shift these points one bin to the left.
|
||||
sampBin[i][on_edge] -= 1
|
||||
|
||||
# Compute the sample indices in the flattened statistic matrix.
|
||||
binnumbers = np.ravel_multi_index(sampBin, nbin)
|
||||
|
||||
return binnumbers
|
||||
375
.CondaPkg/env/Lib/site-packages/scipy/stats/_binomtest.py
vendored
Normal file
375
.CondaPkg/env/Lib/site-packages/scipy/stats/_binomtest.py
vendored
Normal file
@@ -0,0 +1,375 @@
|
||||
from math import sqrt
|
||||
import numpy as np
|
||||
from scipy._lib._util import _validate_int
|
||||
from scipy.optimize import brentq
|
||||
from scipy.special import ndtri
|
||||
from ._discrete_distns import binom
|
||||
from ._common import ConfidenceInterval
|
||||
|
||||
|
||||
class BinomTestResult:
|
||||
"""
|
||||
Result of `scipy.stats.binomtest`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
k : int
|
||||
The number of successes (copied from `binomtest` input).
|
||||
n : int
|
||||
The number of trials (copied from `binomtest` input).
|
||||
alternative : str
|
||||
Indicates the alternative hypothesis specified in the input
|
||||
to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
|
||||
or ``'less'``.
|
||||
statistic: float
|
||||
The estimate of the proportion of successes.
|
||||
pvalue : float
|
||||
The p-value of the hypothesis test.
|
||||
|
||||
"""
|
||||
def __init__(self, k, n, alternative, statistic, pvalue):
|
||||
self.k = k
|
||||
self.n = n
|
||||
self.alternative = alternative
|
||||
self.statistic = statistic
|
||||
self.pvalue = pvalue
|
||||
|
||||
# add alias for backward compatibility
|
||||
self.proportion_estimate = statistic
|
||||
|
||||
def __repr__(self):
|
||||
s = ("BinomTestResult("
|
||||
f"k={self.k}, "
|
||||
f"n={self.n}, "
|
||||
f"alternative={self.alternative!r}, "
|
||||
f"statistic={self.statistic}, "
|
||||
f"pvalue={self.pvalue})")
|
||||
return s
|
||||
|
||||
def proportion_ci(self, confidence_level=0.95, method='exact'):
|
||||
"""
|
||||
Compute the confidence interval for ``statistic``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
confidence_level : float, optional
|
||||
Confidence level for the computed confidence interval
|
||||
of the estimated proportion. Default is 0.95.
|
||||
method : {'exact', 'wilson', 'wilsoncc'}, optional
|
||||
Selects the method used to compute the confidence interval
|
||||
for the estimate of the proportion:
|
||||
|
||||
'exact' :
|
||||
Use the Clopper-Pearson exact method [1]_.
|
||||
'wilson' :
|
||||
Wilson's method, without continuity correction ([2]_, [3]_).
|
||||
'wilsoncc' :
|
||||
Wilson's method, with continuity correction ([2]_, [3]_).
|
||||
|
||||
Default is ``'exact'``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ci : ``ConfidenceInterval`` object
|
||||
The object has attributes ``low`` and ``high`` that hold the
|
||||
lower and upper bounds of the confidence interval.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] C. J. Clopper and E. S. Pearson, The use of confidence or
|
||||
fiducial limits illustrated in the case of the binomial,
|
||||
Biometrika, Vol. 26, No. 4, pp 404-413 (Dec. 1934).
|
||||
.. [2] E. B. Wilson, Probable inference, the law of succession, and
|
||||
statistical inference, J. Amer. Stat. Assoc., 22, pp 209-212
|
||||
(1927).
|
||||
.. [3] Robert G. Newcombe, Two-sided confidence intervals for the
|
||||
single proportion: comparison of seven methods, Statistics
|
||||
in Medicine, 17, pp 857-872 (1998).
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats import binomtest
|
||||
>>> result = binomtest(k=7, n=50, p=0.1)
|
||||
>>> result.statistic
|
||||
0.14
|
||||
>>> result.proportion_ci()
|
||||
ConfidenceInterval(low=0.05819170033997342, high=0.26739600249700846)
|
||||
"""
|
||||
if method not in ('exact', 'wilson', 'wilsoncc'):
|
||||
raise ValueError("method must be one of 'exact', 'wilson' or "
|
||||
"'wilsoncc'.")
|
||||
if not (0 <= confidence_level <= 1):
|
||||
raise ValueError('confidence_level must be in the interval '
|
||||
'[0, 1].')
|
||||
if method == 'exact':
|
||||
low, high = _binom_exact_conf_int(self.k, self.n,
|
||||
confidence_level,
|
||||
self.alternative)
|
||||
else:
|
||||
# method is 'wilson' or 'wilsoncc'
|
||||
low, high = _binom_wilson_conf_int(self.k, self.n,
|
||||
confidence_level,
|
||||
self.alternative,
|
||||
correction=method == 'wilsoncc')
|
||||
return ConfidenceInterval(low=low, high=high)
|
||||
|
||||
|
||||
def _findp(func):
|
||||
try:
|
||||
p = brentq(func, 0, 1)
|
||||
except RuntimeError:
|
||||
raise RuntimeError('numerical solver failed to converge when '
|
||||
'computing the confidence limits') from None
|
||||
except ValueError as exc:
|
||||
raise ValueError('brentq raised a ValueError; report this to the '
|
||||
'SciPy developers') from exc
|
||||
return p
|
||||
|
||||
|
||||
def _binom_exact_conf_int(k, n, confidence_level, alternative):
|
||||
"""
|
||||
Compute the estimate and confidence interval for the binomial test.
|
||||
|
||||
Returns proportion, prop_low, prop_high
|
||||
"""
|
||||
if alternative == 'two-sided':
|
||||
alpha = (1 - confidence_level) / 2
|
||||
if k == 0:
|
||||
plow = 0.0
|
||||
else:
|
||||
plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
|
||||
if k == n:
|
||||
phigh = 1.0
|
||||
else:
|
||||
phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
|
||||
elif alternative == 'less':
|
||||
alpha = 1 - confidence_level
|
||||
plow = 0.0
|
||||
if k == n:
|
||||
phigh = 1.0
|
||||
else:
|
||||
phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
|
||||
elif alternative == 'greater':
|
||||
alpha = 1 - confidence_level
|
||||
if k == 0:
|
||||
plow = 0.0
|
||||
else:
|
||||
plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
|
||||
phigh = 1.0
|
||||
return plow, phigh
|
||||
|
||||
|
||||
def _binom_wilson_conf_int(k, n, confidence_level, alternative, correction):
|
||||
# This function assumes that the arguments have already been validated.
|
||||
# In particular, `alternative` must be one of 'two-sided', 'less' or
|
||||
# 'greater'.
|
||||
p = k / n
|
||||
if alternative == 'two-sided':
|
||||
z = ndtri(0.5 + 0.5*confidence_level)
|
||||
else:
|
||||
z = ndtri(confidence_level)
|
||||
|
||||
# For reference, the formulas implemented here are from
|
||||
# Newcombe (1998) (ref. [3] in the proportion_ci docstring).
|
||||
denom = 2*(n + z**2)
|
||||
center = (2*n*p + z**2)/denom
|
||||
q = 1 - p
|
||||
if correction:
|
||||
if alternative == 'less' or k == 0:
|
||||
lo = 0.0
|
||||
else:
|
||||
dlo = (1 + z*sqrt(z**2 - 2 - 1/n + 4*p*(n*q + 1))) / denom
|
||||
lo = center - dlo
|
||||
if alternative == 'greater' or k == n:
|
||||
hi = 1.0
|
||||
else:
|
||||
dhi = (1 + z*sqrt(z**2 + 2 - 1/n + 4*p*(n*q - 1))) / denom
|
||||
hi = center + dhi
|
||||
else:
|
||||
delta = z/denom * sqrt(4*n*p*q + z**2)
|
||||
if alternative == 'less' or k == 0:
|
||||
lo = 0.0
|
||||
else:
|
||||
lo = center - delta
|
||||
if alternative == 'greater' or k == n:
|
||||
hi = 1.0
|
||||
else:
|
||||
hi = center + delta
|
||||
|
||||
return lo, hi
|
||||
|
||||
|
||||
def binomtest(k, n, p=0.5, alternative='two-sided'):
|
||||
"""
|
||||
Perform a test that the probability of success is p.
|
||||
|
||||
The binomial test [1]_ is a test of the null hypothesis that the
|
||||
probability of success in a Bernoulli experiment is `p`.
|
||||
|
||||
Details of the test can be found in many texts on statistics, such
|
||||
as section 24.5 of [2]_.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
k : int
|
||||
The number of successes.
|
||||
n : int
|
||||
The number of trials.
|
||||
p : float, optional
|
||||
The hypothesized probability of success, i.e. the expected
|
||||
proportion of successes. The value must be in the interval
|
||||
``0 <= p <= 1``. The default value is ``p = 0.5``.
|
||||
alternative : {'two-sided', 'greater', 'less'}, optional
|
||||
Indicates the alternative hypothesis. The default value is
|
||||
'two-sided'.
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : `~scipy.stats._result_classes.BinomTestResult` instance
|
||||
The return value is an object with the following attributes:
|
||||
|
||||
k : int
|
||||
The number of successes (copied from `binomtest` input).
|
||||
n : int
|
||||
The number of trials (copied from `binomtest` input).
|
||||
alternative : str
|
||||
Indicates the alternative hypothesis specified in the input
|
||||
to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
|
||||
or ``'less'``.
|
||||
statistic : float
|
||||
The estimate of the proportion of successes.
|
||||
pvalue : float
|
||||
The p-value of the hypothesis test.
|
||||
|
||||
The object has the following methods:
|
||||
|
||||
proportion_ci(confidence_level=0.95, method='exact') :
|
||||
Compute the confidence interval for ``statistic``.
|
||||
|
||||
Notes
|
||||
-----
|
||||
.. versionadded:: 1.7.0
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Binomial test, https://en.wikipedia.org/wiki/Binomial_test
|
||||
.. [2] Jerrold H. Zar, Biostatistical Analysis (fifth edition),
|
||||
Prentice Hall, Upper Saddle River, New Jersey USA (2010)
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats import binomtest
|
||||
|
||||
A car manufacturer claims that no more than 10% of their cars are unsafe.
|
||||
15 cars are inspected for safety, 3 were found to be unsafe. Test the
|
||||
manufacturer's claim:
|
||||
|
||||
>>> result = binomtest(3, n=15, p=0.1, alternative='greater')
|
||||
>>> result.pvalue
|
||||
0.18406106910639114
|
||||
|
||||
The null hypothesis cannot be rejected at the 5% level of significance
|
||||
because the returned p-value is greater than the critical value of 5%.
|
||||
|
||||
The test statistic is equal to the estimated proportion, which is simply
|
||||
``3/15``:
|
||||
|
||||
>>> result.statistic
|
||||
0.2
|
||||
|
||||
We can use the `proportion_ci()` method of the result to compute the
|
||||
confidence interval of the estimate:
|
||||
|
||||
>>> result.proportion_ci(confidence_level=0.95)
|
||||
ConfidenceInterval(low=0.05684686759024681, high=1.0)
|
||||
|
||||
"""
|
||||
k = _validate_int(k, 'k', minimum=0)
|
||||
n = _validate_int(n, 'n', minimum=1)
|
||||
if k > n:
|
||||
raise ValueError('k must not be greater than n.')
|
||||
|
||||
if not (0 <= p <= 1):
|
||||
raise ValueError("p must be in range [0,1]")
|
||||
|
||||
if alternative not in ('two-sided', 'less', 'greater'):
|
||||
raise ValueError("alternative not recognized; \n"
|
||||
"must be 'two-sided', 'less' or 'greater'")
|
||||
if alternative == 'less':
|
||||
pval = binom.cdf(k, n, p)
|
||||
elif alternative == 'greater':
|
||||
pval = binom.sf(k-1, n, p)
|
||||
else:
|
||||
# alternative is 'two-sided'
|
||||
d = binom.pmf(k, n, p)
|
||||
rerr = 1 + 1e-7
|
||||
if k == p * n:
|
||||
# special case as shortcut, would also be handled by `else` below
|
||||
pval = 1.
|
||||
elif k < p * n:
|
||||
ix = _binary_search_for_binom_tst(lambda x1: -binom.pmf(x1, n, p),
|
||||
-d*rerr, np.ceil(p * n), n)
|
||||
# y is the number of terms between mode and n that are <= d*rerr.
|
||||
# ix gave us the first term where a(ix) <= d*rerr < a(ix-1)
|
||||
# if the first equality doesn't hold, y=n-ix. Otherwise, we
|
||||
# need to include ix as well as the equality holds. Note that
|
||||
# the equality will hold in very very rare situations due to rerr.
|
||||
y = n - ix + int(d*rerr == binom.pmf(ix, n, p))
|
||||
pval = binom.cdf(k, n, p) + binom.sf(n - y, n, p)
|
||||
else:
|
||||
ix = _binary_search_for_binom_tst(lambda x1: binom.pmf(x1, n, p),
|
||||
d*rerr, 0, np.floor(p * n))
|
||||
# y is the number of terms between 0 and mode that are <= d*rerr.
|
||||
# we need to add a 1 to account for the 0 index.
|
||||
# For comparing this with old behavior, see
|
||||
# tst_binary_srch_for_binom_tst method in test_morestats.
|
||||
y = ix + 1
|
||||
pval = binom.cdf(y-1, n, p) + binom.sf(k-1, n, p)
|
||||
|
||||
pval = min(1.0, pval)
|
||||
|
||||
result = BinomTestResult(k=k, n=n, alternative=alternative,
|
||||
statistic=k/n, pvalue=pval)
|
||||
return result
|
||||
|
||||
|
||||
def _binary_search_for_binom_tst(a, d, lo, hi):
|
||||
"""
|
||||
Conducts an implicit binary search on a function specified by `a`.
|
||||
|
||||
Meant to be used on the binomial PMF for the case of two-sided tests
|
||||
to obtain the value on the other side of the mode where the tail
|
||||
probability should be computed. The values on either side of
|
||||
the mode are always in order, meaning binary search is applicable.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a : callable
|
||||
The function over which to perform binary search. Its values
|
||||
for inputs lo and hi should be in ascending order.
|
||||
d : float
|
||||
The value to search.
|
||||
lo : int
|
||||
The lower end of range to search.
|
||||
hi : int
|
||||
The higher end of the range to search.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int
|
||||
The index, i between lo and hi
|
||||
such that a(i)<=d<a(i+1)
|
||||
"""
|
||||
while lo < hi:
|
||||
mid = lo + (hi-lo)//2
|
||||
midval = a(mid)
|
||||
if midval < d:
|
||||
lo = mid+1
|
||||
elif midval > d:
|
||||
hi = mid-1
|
||||
else:
|
||||
return mid
|
||||
if a(lo) <= d:
|
||||
return lo
|
||||
else:
|
||||
return lo-1
|
||||
53
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/__init__.py
vendored
Normal file
53
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/__init__.py
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
from scipy.stats._boost.beta_ufunc import (
|
||||
_beta_pdf, _beta_cdf, _beta_sf, _beta_ppf,
|
||||
_beta_isf, _beta_mean, _beta_variance,
|
||||
_beta_skewness, _beta_kurtosis_excess,
|
||||
)
|
||||
|
||||
from scipy.stats._boost.binom_ufunc import (
|
||||
_binom_pdf, _binom_cdf, _binom_sf, _binom_ppf,
|
||||
_binom_isf, _binom_mean, _binom_variance,
|
||||
_binom_skewness, _binom_kurtosis_excess,
|
||||
)
|
||||
|
||||
from scipy.stats._boost.nbinom_ufunc import (
|
||||
_nbinom_pdf, _nbinom_cdf, _nbinom_sf, _nbinom_ppf,
|
||||
_nbinom_isf, _nbinom_mean, _nbinom_variance,
|
||||
_nbinom_skewness, _nbinom_kurtosis_excess,
|
||||
)
|
||||
|
||||
from scipy.stats._boost.hypergeom_ufunc import (
|
||||
_hypergeom_pdf, _hypergeom_cdf, _hypergeom_sf, _hypergeom_ppf,
|
||||
_hypergeom_isf, _hypergeom_mean, _hypergeom_variance,
|
||||
_hypergeom_skewness, _hypergeom_kurtosis_excess,
|
||||
)
|
||||
|
||||
from scipy.stats._boost.ncf_ufunc import (
|
||||
_ncf_pdf, _ncf_cdf, _ncf_sf, _ncf_ppf,
|
||||
_ncf_isf, _ncf_mean, _ncf_variance,
|
||||
_ncf_skewness, _ncf_kurtosis_excess,
|
||||
)
|
||||
|
||||
from scipy.stats._boost.ncx2_ufunc import (
|
||||
_ncx2_pdf, _ncx2_cdf, _ncx2_sf, _ncx2_ppf,
|
||||
_ncx2_isf, _ncx2_mean, _ncx2_variance,
|
||||
_ncx2_skewness, _ncx2_kurtosis_excess,
|
||||
)
|
||||
|
||||
from scipy.stats._boost.nct_ufunc import (
|
||||
_nct_pdf, _nct_cdf, _nct_sf, _nct_ppf,
|
||||
_nct_isf, _nct_mean, _nct_variance,
|
||||
_nct_skewness, _nct_kurtosis_excess,
|
||||
)
|
||||
|
||||
from scipy.stats._boost.skewnorm_ufunc import (
|
||||
_skewnorm_pdf, _skewnorm_cdf, _skewnorm_sf, _skewnorm_ppf,
|
||||
_skewnorm_isf, _skewnorm_mean, _skewnorm_variance,
|
||||
_skewnorm_skewness, _skewnorm_kurtosis_excess,
|
||||
)
|
||||
|
||||
from scipy.stats._boost.invgauss_ufunc import (
|
||||
_invgauss_pdf, _invgauss_cdf, _invgauss_sf, _invgauss_ppf,
|
||||
_invgauss_isf, _invgauss_mean, _invgauss_variance,
|
||||
_invgauss_skewness, _invgauss_kurtosis_excess,
|
||||
)
|
||||
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/__pycache__/__init__.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/__pycache__/__init__.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/beta_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/beta_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/beta_ufunc.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/beta_ufunc.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/binom_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/binom_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/binom_ufunc.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/binom_ufunc.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/hypergeom_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/hypergeom_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/hypergeom_ufunc.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/hypergeom_ufunc.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/invgauss_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/invgauss_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/invgauss_ufunc.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/invgauss_ufunc.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/nbinom_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/nbinom_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/nbinom_ufunc.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/nbinom_ufunc.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/ncf_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/ncf_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/ncf_ufunc.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/ncf_ufunc.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/nct_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/nct_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/nct_ufunc.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/nct_ufunc.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/ncx2_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/ncx2_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/ncx2_ufunc.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/ncx2_ufunc.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/skewnorm_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/skewnorm_ufunc.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/skewnorm_ufunc.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_boost/skewnorm_ufunc.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
6
.CondaPkg/env/Lib/site-packages/scipy/stats/_common.py
vendored
Normal file
6
.CondaPkg/env/Lib/site-packages/scipy/stats/_common.py
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
|
||||
ConfidenceInterval = namedtuple("ConfidenceInterval", ["low", "high"])
|
||||
ConfidenceInterval. __doc__ = "Class for confidence intervals."
|
||||
34
.CondaPkg/env/Lib/site-packages/scipy/stats/_constants.py
vendored
Normal file
34
.CondaPkg/env/Lib/site-packages/scipy/stats/_constants.py
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
"""
|
||||
Statistics-related constants.
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
# The smallest representable positive number such that 1.0 + _EPS != 1.0.
|
||||
_EPS = np.finfo(float).eps
|
||||
|
||||
# The largest [in magnitude] usable floating value.
|
||||
_XMAX = np.finfo(float).max
|
||||
|
||||
# The log of the largest usable floating value; useful for knowing
|
||||
# when exp(something) will overflow
|
||||
_LOGXMAX = np.log(_XMAX)
|
||||
|
||||
# The smallest [in magnitude] usable floating value.
|
||||
_XMIN = np.finfo(float).tiny
|
||||
|
||||
# -special.psi(1)
|
||||
_EULER = 0.577215664901532860606512090082402431042
|
||||
|
||||
# special.zeta(3, 1) Apery's constant
|
||||
_ZETA3 = 1.202056903159594285399738161511449990765
|
||||
|
||||
# sqrt(pi)
|
||||
_SQRT_PI = 1.772453850905516027298167483341145182798
|
||||
|
||||
# sqrt(2/pi)
|
||||
_SQRT_2_OVER_PI = 0.7978845608028654
|
||||
|
||||
# log(sqrt(2/pi))
|
||||
_LOG_SQRT_2_OVER_PI = -0.22579135264472744
|
||||
10314
.CondaPkg/env/Lib/site-packages/scipy/stats/_continuous_distns.py
vendored
Normal file
10314
.CondaPkg/env/Lib/site-packages/scipy/stats/_continuous_distns.py
vendored
Normal file
File diff suppressed because it is too large
Load Diff
629
.CondaPkg/env/Lib/site-packages/scipy/stats/_covariance.py
vendored
Normal file
629
.CondaPkg/env/Lib/site-packages/scipy/stats/_covariance.py
vendored
Normal file
@@ -0,0 +1,629 @@
|
||||
from functools import cached_property
|
||||
|
||||
import numpy as np
|
||||
from scipy import linalg
|
||||
from scipy.stats import _multivariate
|
||||
|
||||
|
||||
__all__ = ["Covariance"]
|
||||
|
||||
|
||||
class Covariance:
|
||||
"""
|
||||
Representation of a covariance matrix
|
||||
|
||||
Calculations involving covariance matrices (e.g. data whitening,
|
||||
multivariate normal function evaluation) are often performed more
|
||||
efficiently using a decomposition of the covariance matrix instead of the
|
||||
covariance metrix itself. This class allows the user to construct an
|
||||
object representing a covariance matrix using any of several
|
||||
decompositions and perform calculations using a common interface.
|
||||
|
||||
.. note::
|
||||
|
||||
The `Covariance` class cannot be instantiated directly. Instead, use
|
||||
one of the factory methods (e.g. `Covariance.from_diagonal`).
|
||||
|
||||
Examples
|
||||
--------
|
||||
The `Covariance` class is is used by calling one of its
|
||||
factory methods to create a `Covariance` object, then pass that
|
||||
representation of the `Covariance` matrix as a shape parameter of a
|
||||
multivariate distribution.
|
||||
|
||||
For instance, the multivariate normal distribution can accept an array
|
||||
representing a covariance matrix:
|
||||
|
||||
>>> from scipy import stats
|
||||
>>> import numpy as np
|
||||
>>> d = [1, 2, 3]
|
||||
>>> A = np.diag(d) # a diagonal covariance matrix
|
||||
>>> x = [4, -2, 5] # a point of interest
|
||||
>>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=A)
|
||||
>>> dist.pdf(x)
|
||||
4.9595685102808205e-08
|
||||
|
||||
but the calculations are performed in a very generic way that does not
|
||||
take advantage of any special properties of the covariance matrix. Because
|
||||
our covariance matrix is diagonal, we can use ``Covariance.from_diagonal``
|
||||
to create an object representing the covariance matrix, and
|
||||
`multivariate_normal` can use this to compute the probability density
|
||||
function more efficiently.
|
||||
|
||||
>>> cov = stats.Covariance.from_diagonal(d)
|
||||
>>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=cov)
|
||||
>>> dist.pdf(x)
|
||||
4.9595685102808205e-08
|
||||
|
||||
"""
|
||||
def __init__(self):
|
||||
message = ("The `Covariance` class cannot be instantiated directly. "
|
||||
"Please use one of the factory methods "
|
||||
"(e.g. `Covariance.from_diagonal`).")
|
||||
raise NotImplementedError(message)
|
||||
|
||||
@staticmethod
|
||||
def from_diagonal(diagonal):
|
||||
r"""
|
||||
Return a representation of a covariance matrix from its diagonal.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
diagonal : array_like
|
||||
The diagonal elements of a diagonal matrix.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Let the diagonal elements of a diagonal covariance matrix :math:`D` be
|
||||
stored in the vector :math:`d`.
|
||||
|
||||
When all elements of :math:`d` are strictly positive, whitening of a
|
||||
data point :math:`x` is performed by computing
|
||||
:math:`x \cdot d^{-1/2}`, where the inverse square root can be taken
|
||||
element-wise.
|
||||
:math:`\log\det{D}` is calculated as :math:`-2 \sum(\log{d})`,
|
||||
where the :math:`\log` operation is performed element-wise.
|
||||
|
||||
This `Covariance` class supports singular covariance matrices. When
|
||||
computing ``_log_pdet``, non-positive elements of :math:`d` are
|
||||
ignored. Whitening is not well defined when the point to be whitened
|
||||
does not lie in the span of the columns of the covariance matrix. The
|
||||
convention taken here is to treat the inverse square root of
|
||||
non-positive elements of :math:`d` as zeros.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Prepare a symmetric positive definite covariance matrix ``A`` and a
|
||||
data point ``x``.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> n = 5
|
||||
>>> A = np.diag(rng.random(n))
|
||||
>>> x = rng.random(size=n)
|
||||
|
||||
Extract the diagonal from ``A`` and create the `Covariance` object.
|
||||
|
||||
>>> d = np.diag(A)
|
||||
>>> cov = stats.Covariance.from_diagonal(d)
|
||||
|
||||
Compare the functionality of the `Covariance` object against a
|
||||
reference implementations.
|
||||
|
||||
>>> res = cov.whiten(x)
|
||||
>>> ref = np.diag(d**-0.5) @ x
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
>>> res = cov.log_pdet
|
||||
>>> ref = np.linalg.slogdet(A)[-1]
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
|
||||
"""
|
||||
return CovViaDiagonal(diagonal)
|
||||
|
||||
@staticmethod
|
||||
def from_precision(precision, covariance=None):
|
||||
r"""
|
||||
Return a representation of a covariance from its precision matrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
precision : array_like
|
||||
The precision matrix; that is, the inverse of a square, symmetric,
|
||||
positive definite covariance matrix.
|
||||
covariance : array_like, optional
|
||||
The square, symmetric, positive definite covariance matrix. If not
|
||||
provided, this may need to be calculated (e.g. to evaluate the
|
||||
cumulative distribution function of
|
||||
`scipy.stats.multivariate_normal`) by inverting `precision`.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Let the covariance matrix be :math:`A`, its precision matrix be
|
||||
:math:`P = A^{-1}`, and :math:`L` be the lower Cholesky factor such
|
||||
that :math:`L L^T = P`.
|
||||
Whitening of a data point :math:`x` is performed by computing
|
||||
:math:`x^T L`. :math:`\log\det{A}` is calculated as
|
||||
:math:`-2tr(\log{L})`, where the :math:`\log` operation is performed
|
||||
element-wise.
|
||||
|
||||
This `Covariance` class does not support singular covariance matrices
|
||||
because the precision matrix does not exist for a singular covariance
|
||||
matrix.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Prepare a symmetric positive definite precision matrix ``P`` and a
|
||||
data point ``x``. (If the precision matrix is not already available,
|
||||
consider the other factory methods of the `Covariance` class.)
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> n = 5
|
||||
>>> P = rng.random(size=(n, n))
|
||||
>>> P = P @ P.T # a precision matrix must be positive definite
|
||||
>>> x = rng.random(size=n)
|
||||
|
||||
Create the `Covariance` object.
|
||||
|
||||
>>> cov = stats.Covariance.from_precision(P)
|
||||
|
||||
Compare the functionality of the `Covariance` object against
|
||||
reference implementations.
|
||||
|
||||
>>> res = cov.whiten(x)
|
||||
>>> ref = x @ np.linalg.cholesky(P)
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
>>> res = cov.log_pdet
|
||||
>>> ref = -np.linalg.slogdet(P)[-1]
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
|
||||
"""
|
||||
return CovViaPrecision(precision, covariance)
|
||||
|
||||
@staticmethod
|
||||
def from_cholesky(cholesky):
|
||||
r"""
|
||||
Representation of a covariance provided via the (lower) Cholesky factor
|
||||
|
||||
Parameters
|
||||
----------
|
||||
cholesky : array_like
|
||||
The lower triangular Cholesky factor of the covariance matrix.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Let the covariance matrix be :math:`A`and :math:`L` be the lower
|
||||
Cholesky factor such that :math:`L L^T = A`.
|
||||
Whitening of a data point :math:`x` is performed by computing
|
||||
:math:`L^{-1} x`. :math:`\log\det{A}` is calculated as
|
||||
:math:`2tr(\log{L})`, where the :math:`\log` operation is performed
|
||||
element-wise.
|
||||
|
||||
This `Covariance` class does not support singular covariance matrices
|
||||
because the Cholesky decomposition does not exist for a singular
|
||||
covariance matrix.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Prepare a symmetric positive definite covariance matrix ``A`` and a
|
||||
data point ``x``.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> n = 5
|
||||
>>> A = rng.random(size=(n, n))
|
||||
>>> A = A @ A.T # make the covariance symmetric positive definite
|
||||
>>> x = rng.random(size=n)
|
||||
|
||||
Perform the Cholesky decomposition of ``A`` and create the
|
||||
`Covariance` object.
|
||||
|
||||
>>> L = np.linalg.cholesky(A)
|
||||
>>> cov = stats.Covariance.from_cholesky(L)
|
||||
|
||||
Compare the functionality of the `Covariance` object against
|
||||
reference implementation.
|
||||
|
||||
>>> from scipy.linalg import solve_triangular
|
||||
>>> res = cov.whiten(x)
|
||||
>>> ref = solve_triangular(L, x, lower=True)
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
>>> res = cov.log_pdet
|
||||
>>> ref = np.linalg.slogdet(A)[-1]
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
|
||||
"""
|
||||
return CovViaCholesky(cholesky)
|
||||
|
||||
@staticmethod
|
||||
def from_eigendecomposition(eigendecomposition):
|
||||
r"""
|
||||
Representation of a covariance provided via eigendecomposition
|
||||
|
||||
Parameters
|
||||
----------
|
||||
eigendecomposition : sequence
|
||||
A sequence (nominally a tuple) containing the eigenvalue and
|
||||
eigenvector arrays as computed by `scipy.linalg.eigh` or
|
||||
`numpy.linalg.eigh`.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Let the covariance matrix be :math:`A`, let :math:`V` be matrix of
|
||||
eigenvectors, and let :math:`W` be the diagonal matrix of eigenvalues
|
||||
such that `V W V^T = A`.
|
||||
|
||||
When all of the eigenvalues are strictly positive, whitening of a
|
||||
data point :math:`x` is performed by computing
|
||||
:math:`x^T (V W^{-1/2})`, where the inverse square root can be taken
|
||||
element-wise.
|
||||
:math:`\log\det{A}` is calculated as :math:`tr(\log{W})`,
|
||||
where the :math:`\log` operation is performed element-wise.
|
||||
|
||||
This `Covariance` class supports singular covariance matrices. When
|
||||
computing ``_log_pdet``, non-positive eigenvalues are ignored.
|
||||
Whitening is not well defined when the point to be whitened
|
||||
does not lie in the span of the columns of the covariance matrix. The
|
||||
convention taken here is to treat the inverse square root of
|
||||
non-positive eigenvalues as zeros.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Prepare a symmetric positive definite covariance matrix ``A`` and a
|
||||
data point ``x``.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> n = 5
|
||||
>>> A = rng.random(size=(n, n))
|
||||
>>> A = A @ A.T # make the covariance symmetric positive definite
|
||||
>>> x = rng.random(size=n)
|
||||
|
||||
Perform the eigendecomposition of ``A`` and create the `Covariance`
|
||||
object.
|
||||
|
||||
>>> w, v = np.linalg.eigh(A)
|
||||
>>> cov = stats.Covariance.from_eigendecomposition((w, v))
|
||||
|
||||
Compare the functionality of the `Covariance` object against
|
||||
reference implementations.
|
||||
|
||||
>>> res = cov.whiten(x)
|
||||
>>> ref = x @ (v @ np.diag(w**-0.5))
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
>>> res = cov.log_pdet
|
||||
>>> ref = np.linalg.slogdet(A)[-1]
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
|
||||
"""
|
||||
return CovViaEigendecomposition(eigendecomposition)
|
||||
|
||||
def whiten(self, x):
|
||||
"""
|
||||
Perform a whitening transformation on data.
|
||||
|
||||
"Whitening" ("white" as in "white noise", in which each frequency has
|
||||
equal magnitude) transforms a set of random variables into a new set of
|
||||
random variables with unit-diagonal covariance. When a whitening
|
||||
transform is applied to a sample of points distributed according to
|
||||
a multivariate normal distribution with zero mean, the covariance of
|
||||
the transformed sample is approximately the identity matrix.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array_like
|
||||
An array of points. The last dimension must correspond with the
|
||||
dimensionality of the space, i.e., the number of columns in the
|
||||
covariance matrix.
|
||||
|
||||
Returns
|
||||
-------
|
||||
x_ : array_like
|
||||
The transformed array of points.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] "Whitening Transformation". Wikipedia.
|
||||
https://en.wikipedia.org/wiki/Whitening_transformation
|
||||
.. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
|
||||
coloring linear transformation". Transactions of VSB 18.2
|
||||
(2018): 31-35. :doi:`10.31490/tces-2018-0013`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> n = 3
|
||||
>>> A = rng.random(size=(n, n))
|
||||
>>> cov_array = A @ A.T # make matrix symmetric positive definite
|
||||
>>> precision = np.linalg.inv(cov_array)
|
||||
>>> cov_object = stats.Covariance.from_precision(precision)
|
||||
>>> x = rng.multivariate_normal(np.zeros(n), cov_array, size=(10000))
|
||||
>>> x_ = cov_object.whiten(x)
|
||||
>>> np.cov(x_, rowvar=False) # near-identity covariance
|
||||
array([[0.97862122, 0.00893147, 0.02430451],
|
||||
[0.00893147, 0.96719062, 0.02201312],
|
||||
[0.02430451, 0.02201312, 0.99206881]])
|
||||
|
||||
"""
|
||||
return self._whiten(np.asarray(x))
|
||||
|
||||
def colorize(self, x):
|
||||
"""
|
||||
Perform a colorizing transformation on data.
|
||||
|
||||
"Colorizing" ("color" as in "colored noise", in which different
|
||||
frequencies may have different magnitudes) transforms a set of
|
||||
uncorrelated random variables into a new set of random variables with
|
||||
the desired covariance. When a coloring transform is applied to a
|
||||
sample of points distributed according to a multivariate normal
|
||||
distribution with identity covariance and zero mean, the covariance of
|
||||
the transformed sample is approximately the covariance matrix used
|
||||
in the coloring transform.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array_like
|
||||
An array of points. The last dimension must correspond with the
|
||||
dimensionality of the space, i.e., the number of columns in the
|
||||
covariance matrix.
|
||||
|
||||
Returns
|
||||
-------
|
||||
x_ : array_like
|
||||
The transformed array of points.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] "Whitening Transformation". Wikipedia.
|
||||
https://en.wikipedia.org/wiki/Whitening_transformation
|
||||
.. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
|
||||
coloring linear transformation". Transactions of VSB 18.2
|
||||
(2018): 31-35. :doi:`10.31490/tces-2018-0013`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng(1638083107694713882823079058616272161)
|
||||
>>> n = 3
|
||||
>>> A = rng.random(size=(n, n))
|
||||
>>> cov_array = A @ A.T # make matrix symmetric positive definite
|
||||
>>> cholesky = np.linalg.cholesky(cov_array)
|
||||
>>> cov_object = stats.Covariance.from_cholesky(cholesky)
|
||||
>>> x = rng.multivariate_normal(np.zeros(n), np.eye(n), size=(10000))
|
||||
>>> x_ = cov_object.colorize(x)
|
||||
>>> cov_data = np.cov(x_, rowvar=False)
|
||||
>>> np.allclose(cov_data, cov_array, rtol=3e-2)
|
||||
True
|
||||
"""
|
||||
return self._colorize(np.asarray(x))
|
||||
|
||||
@property
|
||||
def log_pdet(self):
|
||||
"""
|
||||
Log of the pseudo-determinant of the covariance matrix
|
||||
"""
|
||||
return np.array(self._log_pdet, dtype=float)[()]
|
||||
|
||||
@property
|
||||
def rank(self):
|
||||
"""
|
||||
Rank of the covariance matrix
|
||||
"""
|
||||
return np.array(self._rank, dtype=int)[()]
|
||||
|
||||
@property
|
||||
def covariance(self):
|
||||
"""
|
||||
Explicit representation of the covariance matrix
|
||||
"""
|
||||
return self._covariance
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
"""
|
||||
Shape of the covariance array
|
||||
"""
|
||||
return self._shape
|
||||
|
||||
def _validate_matrix(self, A, name):
|
||||
A = np.atleast_2d(A)
|
||||
m, n = A.shape[-2:]
|
||||
if m != n or A.ndim != 2 or not (np.issubdtype(A.dtype, np.integer) or
|
||||
np.issubdtype(A.dtype, np.floating)):
|
||||
message = (f"The input `{name}` must be a square, "
|
||||
"two-dimensional array of real numbers.")
|
||||
raise ValueError(message)
|
||||
return A
|
||||
|
||||
def _validate_vector(self, A, name):
|
||||
A = np.atleast_1d(A)
|
||||
if A.ndim != 1 or not (np.issubdtype(A.dtype, np.integer) or
|
||||
np.issubdtype(A.dtype, np.floating)):
|
||||
message = (f"The input `{name}` must be a one-dimensional array "
|
||||
"of real numbers.")
|
||||
raise ValueError(message)
|
||||
return A
|
||||
|
||||
|
||||
class CovViaPrecision(Covariance):
|
||||
|
||||
def __init__(self, precision, covariance=None):
|
||||
precision = self._validate_matrix(precision, 'precision')
|
||||
if covariance is not None:
|
||||
covariance = self._validate_matrix(covariance, 'covariance')
|
||||
message = "`precision.shape` must equal `covariance.shape`."
|
||||
if precision.shape != covariance.shape:
|
||||
raise ValueError(message)
|
||||
|
||||
self._chol_P = np.linalg.cholesky(precision)
|
||||
self._log_pdet = -2*np.log(np.diag(self._chol_P)).sum(axis=-1)
|
||||
self._rank = precision.shape[-1] # must be full rank if invertible
|
||||
self._precision = precision
|
||||
self._cov_matrix = covariance
|
||||
self._shape = precision.shape
|
||||
self._allow_singular = False
|
||||
|
||||
def _whiten(self, x):
|
||||
return x @ self._chol_P
|
||||
|
||||
@cached_property
|
||||
def _covariance(self):
|
||||
n = self._shape[-1]
|
||||
return (linalg.cho_solve((self._chol_P, True), np.eye(n))
|
||||
if self._cov_matrix is None else self._cov_matrix)
|
||||
|
||||
def _colorize(self, x):
|
||||
return linalg.solve_triangular(self._chol_P.T, x.T, lower=False).T
|
||||
|
||||
|
||||
def _dot_diag(x, d):
|
||||
# If d were a full diagonal matrix, x @ d would always do what we want.
|
||||
# Special treatment is needed for n-dimensional `d` in which each row
|
||||
# includes only the diagonal elements of a covariance matrix.
|
||||
return x * d if x.ndim < 2 else x * np.expand_dims(d, -2)
|
||||
|
||||
|
||||
class CovViaDiagonal(Covariance):
|
||||
|
||||
def __init__(self, diagonal):
|
||||
diagonal = self._validate_vector(diagonal, 'diagonal')
|
||||
|
||||
i_zero = diagonal <= 0
|
||||
positive_diagonal = np.array(diagonal, dtype=np.float64)
|
||||
|
||||
positive_diagonal[i_zero] = 1 # ones don't affect determinant
|
||||
self._log_pdet = np.sum(np.log(positive_diagonal), axis=-1)
|
||||
|
||||
psuedo_reciprocals = 1 / np.sqrt(positive_diagonal)
|
||||
psuedo_reciprocals[i_zero] = 0
|
||||
|
||||
self._sqrt_diagonal = np.sqrt(diagonal)
|
||||
self._LP = psuedo_reciprocals
|
||||
self._rank = positive_diagonal.shape[-1] - i_zero.sum(axis=-1)
|
||||
self._covariance = np.apply_along_axis(np.diag, -1, diagonal)
|
||||
self._i_zero = i_zero
|
||||
self._shape = self._covariance.shape
|
||||
self._allow_singular = True
|
||||
|
||||
def _whiten(self, x):
|
||||
return _dot_diag(x, self._LP)
|
||||
|
||||
def _colorize(self, x):
|
||||
return _dot_diag(x, self._sqrt_diagonal)
|
||||
|
||||
def _support_mask(self, x):
|
||||
"""
|
||||
Check whether x lies in the support of the distribution.
|
||||
"""
|
||||
return ~np.any(_dot_diag(x, self._i_zero), axis=-1)
|
||||
|
||||
|
||||
class CovViaCholesky(Covariance):
|
||||
|
||||
def __init__(self, cholesky):
|
||||
L = self._validate_matrix(cholesky, 'cholesky')
|
||||
|
||||
self._factor = L
|
||||
self._log_pdet = 2*np.log(np.diag(self._factor)).sum(axis=-1)
|
||||
self._rank = L.shape[-1] # must be full rank for cholesky
|
||||
self._covariance = L @ L.T
|
||||
self._shape = L.shape
|
||||
self._allow_singular = False
|
||||
|
||||
def _whiten(self, x):
|
||||
res = linalg.solve_triangular(self._factor, x.T, lower=True).T
|
||||
return res
|
||||
|
||||
def _colorize(self, x):
|
||||
return x @ self._factor.T
|
||||
|
||||
|
||||
class CovViaEigendecomposition(Covariance):
|
||||
|
||||
def __init__(self, eigendecomposition):
|
||||
eigenvalues, eigenvectors = eigendecomposition
|
||||
eigenvalues = self._validate_vector(eigenvalues, 'eigenvalues')
|
||||
eigenvectors = self._validate_matrix(eigenvectors, 'eigenvectors')
|
||||
message = ("The shapes of `eigenvalues` and `eigenvectors` "
|
||||
"must be compatible.")
|
||||
try:
|
||||
eigenvalues = np.expand_dims(eigenvalues, -2)
|
||||
eigenvectors, eigenvalues = np.broadcast_arrays(eigenvectors,
|
||||
eigenvalues)
|
||||
eigenvalues = eigenvalues[..., 0, :]
|
||||
except ValueError:
|
||||
raise ValueError(message)
|
||||
|
||||
i_zero = eigenvalues <= 0
|
||||
positive_eigenvalues = np.array(eigenvalues, dtype=np.float64)
|
||||
|
||||
positive_eigenvalues[i_zero] = 1 # ones don't affect determinant
|
||||
self._log_pdet = np.sum(np.log(positive_eigenvalues), axis=-1)
|
||||
|
||||
psuedo_reciprocals = 1 / np.sqrt(positive_eigenvalues)
|
||||
psuedo_reciprocals[i_zero] = 0
|
||||
|
||||
self._LP = eigenvectors * psuedo_reciprocals
|
||||
self._LA = eigenvectors * np.sqrt(positive_eigenvalues)
|
||||
self._rank = positive_eigenvalues.shape[-1] - i_zero.sum(axis=-1)
|
||||
self._w = eigenvalues
|
||||
self._v = eigenvectors
|
||||
self._shape = eigenvectors.shape
|
||||
self._null_basis = eigenvectors * i_zero
|
||||
# This is only used for `_support_mask`, not to decide whether
|
||||
# the covariance is singular or not.
|
||||
self._eps = _multivariate._eigvalsh_to_eps(eigenvalues) * 10**3
|
||||
self._allow_singular = True
|
||||
|
||||
def _whiten(self, x):
|
||||
return x @ self._LP
|
||||
|
||||
def _colorize(self, x):
|
||||
return x @ self._LA.T
|
||||
|
||||
@cached_property
|
||||
def _covariance(self):
|
||||
return (self._v * self._w) @ self._v.T
|
||||
|
||||
def _support_mask(self, x):
|
||||
"""
|
||||
Check whether x lies in the support of the distribution.
|
||||
"""
|
||||
residual = np.linalg.norm(x @ self._null_basis, axis=-1)
|
||||
in_support = residual < self._eps
|
||||
return in_support
|
||||
|
||||
class CovViaPSD(Covariance):
|
||||
"""
|
||||
Representation of a covariance provided via an instance of _PSD
|
||||
"""
|
||||
|
||||
def __init__(self, psd):
|
||||
self._LP = psd.U
|
||||
self._log_pdet = psd.log_pdet
|
||||
self._rank = psd.rank
|
||||
self._covariance = psd._M
|
||||
self._shape = psd._M.shape
|
||||
self._psd = psd
|
||||
self._allow_singular = False # by default
|
||||
|
||||
def _whiten(self, x):
|
||||
return x @ self._LP
|
||||
|
||||
def _support_mask(self, x):
|
||||
return self._psd._support_mask(x)
|
||||
203
.CondaPkg/env/Lib/site-packages/scipy/stats/_crosstab.py
vendored
Normal file
203
.CondaPkg/env/Lib/site-packages/scipy/stats/_crosstab.py
vendored
Normal file
@@ -0,0 +1,203 @@
|
||||
import numpy as np
|
||||
from scipy.sparse import coo_matrix
|
||||
from scipy._lib._bunch import _make_tuple_bunch
|
||||
|
||||
|
||||
CrosstabResult = _make_tuple_bunch(
|
||||
"CrosstabResult", ["elements", "count"]
|
||||
)
|
||||
|
||||
def crosstab(*args, levels=None, sparse=False):
|
||||
"""
|
||||
Return table of counts for each possible unique combination in ``*args``.
|
||||
|
||||
When ``len(args) > 1``, the array computed by this function is
|
||||
often referred to as a *contingency table* [1]_.
|
||||
|
||||
The arguments must be sequences with the same length. The second return
|
||||
value, `count`, is an integer array with ``len(args)`` dimensions. If
|
||||
`levels` is None, the shape of `count` is ``(n0, n1, ...)``, where ``nk``
|
||||
is the number of unique elements in ``args[k]``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
*args : sequences
|
||||
A sequence of sequences whose unique aligned elements are to be
|
||||
counted. The sequences in args must all be the same length.
|
||||
levels : sequence, optional
|
||||
If `levels` is given, it must be a sequence that is the same length as
|
||||
`args`. Each element in `levels` is either a sequence or None. If it
|
||||
is a sequence, it gives the values in the corresponding sequence in
|
||||
`args` that are to be counted. If any value in the sequences in `args`
|
||||
does not occur in the corresponding sequence in `levels`, that value
|
||||
is ignored and not counted in the returned array `count`. The default
|
||||
value of `levels` for ``args[i]`` is ``np.unique(args[i])``
|
||||
sparse : bool, optional
|
||||
If True, return a sparse matrix. The matrix will be an instance of
|
||||
the `scipy.sparse.coo_matrix` class. Because SciPy's sparse matrices
|
||||
must be 2-d, only two input sequences are allowed when `sparse` is
|
||||
True. Default is False.
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : CrosstabResult
|
||||
An object containing the following attributes:
|
||||
|
||||
elements : tuple of numpy.ndarrays.
|
||||
Tuple of length ``len(args)`` containing the arrays of elements
|
||||
that are counted in `count`. These can be interpreted as the
|
||||
labels of the corresponding dimensions of `count`. If `levels` was
|
||||
given, then if ``levels[i]`` is not None, ``elements[i]`` will
|
||||
hold the values given in ``levels[i]``.
|
||||
count : numpy.ndarray or scipy.sparse.coo_matrix
|
||||
Counts of the unique elements in ``zip(*args)``, stored in an
|
||||
array. Also known as a *contingency table* when ``len(args) > 1``.
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.unique
|
||||
|
||||
Notes
|
||||
-----
|
||||
.. versionadded:: 1.7.0
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats.contingency import crosstab
|
||||
|
||||
Given the lists `a` and `x`, create a contingency table that counts the
|
||||
frequencies of the corresponding pairs.
|
||||
|
||||
>>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
|
||||
>>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
|
||||
>>> res = crosstab(a, x)
|
||||
>>> avals, xvals = res.elements
|
||||
>>> avals
|
||||
array(['A', 'B'], dtype='<U1')
|
||||
>>> xvals
|
||||
array(['X', 'Y', 'Z'], dtype='<U1')
|
||||
>>> res.count
|
||||
array([[2, 3, 0],
|
||||
[1, 0, 4]])
|
||||
|
||||
So `('A', 'X')` occurs twice, `('A', 'Y')` occurs three times, etc.
|
||||
|
||||
Higher dimensional contingency tables can be created.
|
||||
|
||||
>>> p = [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]
|
||||
>>> res = crosstab(a, x, p)
|
||||
>>> res.count
|
||||
array([[[2, 0],
|
||||
[2, 1],
|
||||
[0, 0]],
|
||||
[[1, 0],
|
||||
[0, 0],
|
||||
[1, 3]]])
|
||||
>>> res.count.shape
|
||||
(2, 3, 2)
|
||||
|
||||
The values to be counted can be set by using the `levels` argument.
|
||||
It allows the elements of interest in each input sequence to be
|
||||
given explicitly instead finding the unique elements of the sequence.
|
||||
|
||||
For example, suppose one of the arguments is an array containing the
|
||||
answers to a survey question, with integer values 1 to 4. Even if the
|
||||
value 1 does not occur in the data, we want an entry for it in the table.
|
||||
|
||||
>>> q1 = [2, 3, 3, 2, 4, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4] # 1 does not occur.
|
||||
>>> q2 = [4, 4, 2, 2, 2, 4, 1, 1, 2, 2, 4, 2, 2, 2, 4] # 3 does not occur.
|
||||
>>> options = [1, 2, 3, 4]
|
||||
>>> res = crosstab(q1, q2, levels=(options, options))
|
||||
>>> res.count
|
||||
array([[0, 0, 0, 0],
|
||||
[1, 1, 0, 1],
|
||||
[1, 4, 0, 1],
|
||||
[0, 3, 0, 3]])
|
||||
|
||||
If `levels` is given, but an element of `levels` is None, the unique values
|
||||
of the corresponding argument are used. For example,
|
||||
|
||||
>>> res = crosstab(q1, q2, levels=(None, options))
|
||||
>>> res.elements
|
||||
[array([2, 3, 4]), [1, 2, 3, 4]]
|
||||
>>> res.count
|
||||
array([[1, 1, 0, 1],
|
||||
[1, 4, 0, 1],
|
||||
[0, 3, 0, 3]])
|
||||
|
||||
If we want to ignore the pairs where 4 occurs in ``q2``, we can
|
||||
give just the values [1, 2] to `levels`, and the 4 will be ignored:
|
||||
|
||||
>>> res = crosstab(q1, q2, levels=(None, [1, 2]))
|
||||
>>> res.elements
|
||||
[array([2, 3, 4]), [1, 2]]
|
||||
>>> res.count
|
||||
array([[1, 1],
|
||||
[1, 4],
|
||||
[0, 3]])
|
||||
|
||||
Finally, let's repeat the first example, but return a sparse matrix:
|
||||
|
||||
>>> res = crosstab(a, x, sparse=True)
|
||||
>>> res.count
|
||||
<2x3 sparse matrix of type '<class 'numpy.int64'>'
|
||||
with 4 stored elements in COOrdinate format>
|
||||
>>> res.count.A
|
||||
array([[2, 3, 0],
|
||||
[1, 0, 4]])
|
||||
|
||||
"""
|
||||
nargs = len(args)
|
||||
if nargs == 0:
|
||||
raise TypeError("At least one input sequence is required.")
|
||||
|
||||
len0 = len(args[0])
|
||||
if not all(len(a) == len0 for a in args[1:]):
|
||||
raise ValueError("All input sequences must have the same length.")
|
||||
|
||||
if sparse and nargs != 2:
|
||||
raise ValueError("When `sparse` is True, only two input sequences "
|
||||
"are allowed.")
|
||||
|
||||
if levels is None:
|
||||
# Call np.unique with return_inverse=True on each argument.
|
||||
actual_levels, indices = zip(*[np.unique(a, return_inverse=True)
|
||||
for a in args])
|
||||
else:
|
||||
# `levels` is not None...
|
||||
if len(levels) != nargs:
|
||||
raise ValueError('len(levels) must equal the number of input '
|
||||
'sequences')
|
||||
|
||||
args = [np.asarray(arg) for arg in args]
|
||||
mask = np.zeros((nargs, len0), dtype=np.bool_)
|
||||
inv = np.zeros((nargs, len0), dtype=np.intp)
|
||||
actual_levels = []
|
||||
for k, (levels_list, arg) in enumerate(zip(levels, args)):
|
||||
if levels_list is None:
|
||||
levels_list, inv[k, :] = np.unique(arg, return_inverse=True)
|
||||
mask[k, :] = True
|
||||
else:
|
||||
q = arg == np.asarray(levels_list).reshape(-1, 1)
|
||||
mask[k, :] = np.any(q, axis=0)
|
||||
qnz = q.T.nonzero()
|
||||
inv[k, qnz[0]] = qnz[1]
|
||||
actual_levels.append(levels_list)
|
||||
|
||||
mask_all = mask.all(axis=0)
|
||||
indices = tuple(inv[:, mask_all])
|
||||
|
||||
if sparse:
|
||||
count = coo_matrix((np.ones(len(indices[0]), dtype=int),
|
||||
(indices[0], indices[1])))
|
||||
count.sum_duplicates()
|
||||
else:
|
||||
shape = [len(u) for u in actual_levels]
|
||||
count = np.zeros(shape, dtype=int)
|
||||
np.add.at(count, indices, 1)
|
||||
|
||||
return CrosstabResult(actual_levels, count)
|
||||
1814
.CondaPkg/env/Lib/site-packages/scipy/stats/_discrete_distns.py
vendored
Normal file
1814
.CondaPkg/env/Lib/site-packages/scipy/stats/_discrete_distns.py
vendored
Normal file
File diff suppressed because it is too large
Load Diff
4062
.CondaPkg/env/Lib/site-packages/scipy/stats/_distn_infrastructure.py
vendored
Normal file
4062
.CondaPkg/env/Lib/site-packages/scipy/stats/_distn_infrastructure.py
vendored
Normal file
File diff suppressed because it is too large
Load Diff
281
.CondaPkg/env/Lib/site-packages/scipy/stats/_distr_params.py
vendored
Normal file
281
.CondaPkg/env/Lib/site-packages/scipy/stats/_distr_params.py
vendored
Normal file
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Sane parameters for stats.distributions.
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
distcont = [
|
||||
['alpha', (3.5704770516650459,)],
|
||||
['anglit', ()],
|
||||
['arcsine', ()],
|
||||
['argus', (1.0,)],
|
||||
['beta', (2.3098496451481823, 0.62687954300963677)],
|
||||
['betaprime', (5, 6)],
|
||||
['bradford', (0.29891359763170633,)],
|
||||
['burr', (10.5, 4.3)],
|
||||
['burr12', (10, 4)],
|
||||
['cauchy', ()],
|
||||
['chi', (78,)],
|
||||
['chi2', (55,)],
|
||||
['cosine', ()],
|
||||
['crystalball', (2.0, 3.0)],
|
||||
['dgamma', (1.1023326088288166,)],
|
||||
['dweibull', (2.0685080649914673,)],
|
||||
['erlang', (10,)],
|
||||
['expon', ()],
|
||||
['exponnorm', (1.5,)],
|
||||
['exponpow', (2.697119160358469,)],
|
||||
['exponweib', (2.8923945291034436, 1.9505288745913174)],
|
||||
['f', (29, 18)],
|
||||
['fatiguelife', (29,)], # correction numargs = 1
|
||||
['fisk', (3.0857548622253179,)],
|
||||
['foldcauchy', (4.7164673455831894,)],
|
||||
['foldnorm', (1.9521253373555869,)],
|
||||
['gamma', (1.9932305483800778,)],
|
||||
['gausshyper', (13.763771604130699, 3.1189636648681431,
|
||||
2.5145980350183019, 5.1811649903971615)], # veryslow
|
||||
['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
|
||||
['genextreme', (-0.1,)],
|
||||
['gengamma', (4.4162385429431925, 3.1193091679242761)],
|
||||
['gengamma', (4.4162385429431925, -3.1193091679242761)],
|
||||
['genhalflogistic', (0.77274727809929322,)],
|
||||
['genhyperbolic', (0.5, 1.5, -0.5,)],
|
||||
['geninvgauss', (2.3, 1.5)],
|
||||
['genlogistic', (0.41192440799679475,)],
|
||||
['gennorm', (1.2988442399460265,)],
|
||||
['halfgennorm', (0.6748054997000371,)],
|
||||
['genpareto', (0.1,)], # use case with finite moments
|
||||
['gibrat', ()],
|
||||
['gompertz', (0.94743713075105251,)],
|
||||
['gumbel_l', ()],
|
||||
['gumbel_r', ()],
|
||||
['halfcauchy', ()],
|
||||
['halflogistic', ()],
|
||||
['halfnorm', ()],
|
||||
['hypsecant', ()],
|
||||
['invgamma', (4.0668996136993067,)],
|
||||
['invgauss', (0.14546264555347513,)],
|
||||
['invweibull', (10.58,)],
|
||||
['johnsonsb', (4.3172675099141058, 3.1837781130785063)],
|
||||
['johnsonsu', (2.554395574161155, 2.2482281679651965)],
|
||||
['kappa4', (0.0, 0.0)],
|
||||
['kappa4', (-0.1, 0.1)],
|
||||
['kappa4', (0.0, 0.1)],
|
||||
['kappa4', (0.1, 0.0)],
|
||||
['kappa3', (1.0,)],
|
||||
['ksone', (1000,)], # replace 22 by 100 to avoid failing range, ticket 956
|
||||
['kstwo', (10,)],
|
||||
['kstwobign', ()],
|
||||
['laplace', ()],
|
||||
['laplace_asymmetric', (2,)],
|
||||
['levy', ()],
|
||||
['levy_l', ()],
|
||||
['levy_stable', (1.8, -0.5)],
|
||||
['loggamma', (0.41411931826052117,)],
|
||||
['logistic', ()],
|
||||
['loglaplace', (3.2505926592051435,)],
|
||||
['lognorm', (0.95368226960575331,)],
|
||||
['loguniform', (0.01, 1.25)],
|
||||
['lomax', (1.8771398388773268,)],
|
||||
['maxwell', ()],
|
||||
['mielke', (10.4, 4.6)],
|
||||
['moyal', ()],
|
||||
['nakagami', (4.9673794866666237,)],
|
||||
['ncf', (27, 27, 0.41578441799226107)],
|
||||
['nct', (14, 0.24045031331198066)],
|
||||
['ncx2', (21, 1.0560465975116415)],
|
||||
['norm', ()],
|
||||
['norminvgauss', (1.25, 0.5)],
|
||||
['pareto', (2.621716532144454,)],
|
||||
['pearson3', (0.1,)],
|
||||
['pearson3', (-2,)],
|
||||
['powerlaw', (1.6591133289905851,)],
|
||||
['powerlaw', (0.6591133289905851,)],
|
||||
['powerlognorm', (2.1413923530064087, 0.44639540782048337)],
|
||||
['powernorm', (4.4453652254590779,)],
|
||||
['rayleigh', ()],
|
||||
['rdist', (1.6,)],
|
||||
['recipinvgauss', (0.63004267809369119,)],
|
||||
['reciprocal', (0.01, 1.25)],
|
||||
['rice', (0.7749725210111873,)],
|
||||
['semicircular', ()],
|
||||
['skewcauchy', (0.5,)],
|
||||
['skewnorm', (4.0,)],
|
||||
['studentized_range', (3.0, 10.0)],
|
||||
['t', (2.7433514990818093,)],
|
||||
['trapezoid', (0.2, 0.8)],
|
||||
['triang', (0.15785029824528218,)],
|
||||
['truncexpon', (4.6907725456810478,)],
|
||||
['truncnorm', (-1.0978730080013919, 2.7306754109031979)],
|
||||
['truncnorm', (0.1, 2.)],
|
||||
['truncpareto', (1.8, 5.3)],
|
||||
['truncweibull_min', (2.5, 0.25, 1.75)],
|
||||
['tukeylambda', (3.1321477856738267,)],
|
||||
['uniform', ()],
|
||||
['vonmises', (3.9939042581071398,)],
|
||||
['vonmises_line', (3.9939042581071398,)],
|
||||
['wald', ()],
|
||||
['weibull_max', (2.8687961709100187,)],
|
||||
['weibull_min', (1.7866166930421596,)],
|
||||
['wrapcauchy', (0.031071279018614728,)]]
|
||||
|
||||
|
||||
distdiscrete = [
|
||||
['bernoulli',(0.3,)],
|
||||
['betabinom', (5, 2.3, 0.63)],
|
||||
['binom', (5, 0.4)],
|
||||
['boltzmann',(1.4, 19)],
|
||||
['dlaplace', (0.8,)], # 0.5
|
||||
['geom', (0.5,)],
|
||||
['hypergeom',(30, 12, 6)],
|
||||
['hypergeom',(21,3,12)], # numpy.random (3,18,12) numpy ticket:921
|
||||
['hypergeom',(21,18,11)], # numpy.random (18,3,11) numpy ticket:921
|
||||
['nchypergeom_fisher', (140, 80, 60, 0.5)],
|
||||
['nchypergeom_wallenius', (140, 80, 60, 0.5)],
|
||||
['logser', (0.6,)], # re-enabled, numpy ticket:921
|
||||
['nbinom', (0.4, 0.4)], # from tickets: 583
|
||||
['nbinom', (5, 0.5)],
|
||||
['planck', (0.51,)], # 4.1
|
||||
['poisson', (0.6,)],
|
||||
['randint', (7, 31)],
|
||||
['skellam', (15, 8)],
|
||||
['zipf', (6.5,)],
|
||||
['zipfian', (0.75, 15)],
|
||||
['zipfian', (1.25, 10)],
|
||||
['yulesimon', (11.0,)],
|
||||
['nhypergeom', (20, 7, 1)]
|
||||
]
|
||||
|
||||
|
||||
invdistdiscrete = [
|
||||
# In each of the following, at least one shape parameter is invalid
|
||||
['hypergeom', (3, 3, 4)],
|
||||
['nhypergeom', (5, 2, 8)],
|
||||
['nchypergeom_fisher', (3, 3, 4, 1)],
|
||||
['nchypergeom_wallenius', (3, 3, 4, 1)],
|
||||
['bernoulli', (1.5, )],
|
||||
['binom', (10, 1.5)],
|
||||
['betabinom', (10, -0.4, -0.5)],
|
||||
['boltzmann', (-1, 4)],
|
||||
['dlaplace', (-0.5, )],
|
||||
['geom', (1.5, )],
|
||||
['logser', (1.5, )],
|
||||
['nbinom', (10, 1.5)],
|
||||
['planck', (-0.5, )],
|
||||
['poisson', (-0.5, )],
|
||||
['randint', (5, 2)],
|
||||
['skellam', (-5, -2)],
|
||||
['zipf', (-2, )],
|
||||
['yulesimon', (-2, )],
|
||||
['zipfian', (-0.75, 15)]
|
||||
]
|
||||
|
||||
|
||||
invdistcont = [
|
||||
# In each of the following, at least one shape parameter is invalid
|
||||
['alpha', (-1, )],
|
||||
['anglit', ()],
|
||||
['arcsine', ()],
|
||||
['argus', (-1, )],
|
||||
['beta', (-2, 2)],
|
||||
['betaprime', (-2, 2)],
|
||||
['bradford', (-1, )],
|
||||
['burr', (-1, 1)],
|
||||
['burr12', (-1, 1)],
|
||||
['cauchy', ()],
|
||||
['chi', (-1, )],
|
||||
['chi2', (-1, )],
|
||||
['cosine', ()],
|
||||
['crystalball', (-1, 2)],
|
||||
['dgamma', (-1, )],
|
||||
['dweibull', (-1, )],
|
||||
['erlang', (-1, )],
|
||||
['expon', ()],
|
||||
['exponnorm', (-1, )],
|
||||
['exponweib', (1, -1)],
|
||||
['exponpow', (-1, )],
|
||||
['f', (10, -10)],
|
||||
['fatiguelife', (-1, )],
|
||||
['fisk', (-1, )],
|
||||
['foldcauchy', (-1, )],
|
||||
['foldnorm', (-1, )],
|
||||
['genlogistic', (-1, )],
|
||||
['gennorm', (-1, )],
|
||||
['genpareto', (np.inf, )],
|
||||
['genexpon', (1, 2, -3)],
|
||||
['genextreme', (np.inf, )],
|
||||
['genhyperbolic', (0.5, -0.5, -1.5,)],
|
||||
['gausshyper', (1, 2, 3, -4)],
|
||||
['gamma', (-1, )],
|
||||
['gengamma', (-1, 0)],
|
||||
['genhalflogistic', (-1, )],
|
||||
['geninvgauss', (1, 0)],
|
||||
['gibrat', ()],
|
||||
['gompertz', (-1, )],
|
||||
['gumbel_r', ()],
|
||||
['gumbel_l', ()],
|
||||
['halfcauchy', ()],
|
||||
['halflogistic', ()],
|
||||
['halfnorm', ()],
|
||||
['halfgennorm', (-1, )],
|
||||
['hypsecant', ()],
|
||||
['invgamma', (-1, )],
|
||||
['invgauss', (-1, )],
|
||||
['invweibull', (-1, )],
|
||||
['johnsonsb', (1, -2)],
|
||||
['johnsonsu', (1, -2)],
|
||||
['kappa4', (np.nan, 0)],
|
||||
['kappa3', (-1, )],
|
||||
['ksone', (-1, )],
|
||||
['kstwo', (-1, )],
|
||||
['kstwobign', ()],
|
||||
['laplace', ()],
|
||||
['laplace_asymmetric', (-1, )],
|
||||
['levy', ()],
|
||||
['levy_l', ()],
|
||||
['levy_stable', (-1, 1)],
|
||||
['logistic', ()],
|
||||
['loggamma', (-1, )],
|
||||
['loglaplace', (-1, )],
|
||||
['lognorm', (-1, )],
|
||||
['loguniform', (10, 5)],
|
||||
['lomax', (-1, )],
|
||||
['maxwell', ()],
|
||||
['mielke', (1, -2)],
|
||||
['moyal', ()],
|
||||
['nakagami', (-1, )],
|
||||
['ncx2', (-1, 2)],
|
||||
['ncf', (10, 20, -1)],
|
||||
['nct', (-1, 2)],
|
||||
['norm', ()],
|
||||
['norminvgauss', (5, -10)],
|
||||
['pareto', (-1, )],
|
||||
['pearson3', (np.nan, )],
|
||||
['powerlaw', (-1, )],
|
||||
['powerlognorm', (1, -2)],
|
||||
['powernorm', (-1, )],
|
||||
['rdist', (-1, )],
|
||||
['rayleigh', ()],
|
||||
['rice', (-1, )],
|
||||
['recipinvgauss', (-1, )],
|
||||
['semicircular', ()],
|
||||
['skewnorm', (np.inf, )],
|
||||
['studentized_range', (-1, 1)],
|
||||
['t', (-1, )],
|
||||
['trapezoid', (0, 2)],
|
||||
['triang', (2, )],
|
||||
['truncexpon', (-1, )],
|
||||
['truncnorm', (10, 5)],
|
||||
['truncpareto', (-1, 5)],
|
||||
['truncpareto', (1.8, .5)],
|
||||
['truncweibull_min', (-2.5, 0.25, 1.75)],
|
||||
['tukeylambda', (np.nan, )],
|
||||
['uniform', ()],
|
||||
['vonmises', (-1, )],
|
||||
['vonmises_line', (-1, )],
|
||||
['wald', ()],
|
||||
['weibull_min', (-1, )],
|
||||
['weibull_max', (-1, )],
|
||||
['wrapcauchy', (2, )],
|
||||
['reciprocal', (15, 10)],
|
||||
['skewcauchy', (2, )]
|
||||
]
|
||||
399
.CondaPkg/env/Lib/site-packages/scipy/stats/_entropy.py
vendored
Normal file
399
.CondaPkg/env/Lib/site-packages/scipy/stats/_entropy.py
vendored
Normal file
@@ -0,0 +1,399 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Apr 2 09:06:05 2021
|
||||
|
||||
@author: matth
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import math
|
||||
import numpy as np
|
||||
from scipy import special
|
||||
from typing import Optional, Union
|
||||
|
||||
__all__ = ['entropy', 'differential_entropy']
|
||||
|
||||
|
||||
def entropy(pk: np.typing.ArrayLike,
|
||||
qk: Optional[np.typing.ArrayLike] = None,
|
||||
base: Optional[float] = None,
|
||||
axis: int = 0
|
||||
) -> Union[np.number, np.ndarray]:
|
||||
"""
|
||||
Calculate the Shannon entropy/relative entropy of given distribution(s).
|
||||
|
||||
If only probabilities `pk` are given, the Shannon entropy is calculated as
|
||||
``H = -sum(pk * log(pk))``.
|
||||
|
||||
If `qk` is not None, then compute the relative entropy
|
||||
``D = sum(pk * log(pk / qk))``. This quantity is also known
|
||||
as the Kullback-Leibler divergence.
|
||||
|
||||
This routine will normalize `pk` and `qk` if they don't sum to 1.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
pk : array_like
|
||||
Defines the (discrete) distribution. Along each axis-slice of ``pk``,
|
||||
element ``i`` is the (possibly unnormalized) probability of event
|
||||
``i``.
|
||||
qk : array_like, optional
|
||||
Sequence against which the relative entropy is computed. Should be in
|
||||
the same format as `pk`.
|
||||
base : float, optional
|
||||
The logarithmic base to use, defaults to ``e`` (natural logarithm).
|
||||
axis : int, optional
|
||||
The axis along which the entropy is calculated. Default is 0.
|
||||
|
||||
Returns
|
||||
-------
|
||||
S : {float, array_like}
|
||||
The calculated entropy.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Informally, the Shannon entropy quantifies the expected uncertainty
|
||||
inherent in the possible outcomes of a discrete random variable.
|
||||
For example,
|
||||
if messages consisting of sequences of symbols from a set are to be
|
||||
encoded and transmitted over a noiseless channel, then the Shannon entropy
|
||||
``H(pk)`` gives a tight lower bound for the average number of units of
|
||||
information needed per symbol if the symbols occur with frequencies
|
||||
governed by the discrete distribution `pk` [1]_. The choice of base
|
||||
determines the choice of units; e.g., ``e`` for nats, ``2`` for bits, etc.
|
||||
|
||||
The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
|
||||
number of units of information needed per symbol if the encoding is
|
||||
optimized for the probability distribution `qk` instead of the true
|
||||
distribution `pk`. Informally, the relative entropy quantifies the expected
|
||||
excess in surprise experienced if one believes the true distribution is
|
||||
`qk` when it is actually `pk`.
|
||||
|
||||
A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
|
||||
equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
|
||||
the formula ``CE = -sum(pk * log(qk))``. It gives the average
|
||||
number of units of information needed per symbol if an encoding is
|
||||
optimized for the probability distribution `qk` when the true distribution
|
||||
is `pk`. It is not computed directly by `entropy`, but it can be computed
|
||||
using two calls to the function (see Examples).
|
||||
|
||||
See [2]_ for more information.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
|
||||
Bell System Technical Journal, 27: 379-423.
|
||||
https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
|
||||
.. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
|
||||
Theory (Wiley Series in Telecommunications and Signal Processing).
|
||||
Wiley-Interscience, USA.
|
||||
|
||||
|
||||
Examples
|
||||
--------
|
||||
The outcome of a fair coin is the most uncertain:
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import entropy
|
||||
>>> base = 2 # work in units of bits
|
||||
>>> pk = np.array([1/2, 1/2]) # fair coin
|
||||
>>> H = entropy(pk, base=base)
|
||||
>>> H
|
||||
1.0
|
||||
>>> H == -np.sum(pk * np.log(pk)) / np.log(base)
|
||||
True
|
||||
|
||||
The outcome of a biased coin is less uncertain:
|
||||
|
||||
>>> qk = np.array([9/10, 1/10]) # biased coin
|
||||
>>> entropy(qk, base=base)
|
||||
0.46899559358928117
|
||||
|
||||
The relative entropy between the fair coin and biased coin is calculated
|
||||
as:
|
||||
|
||||
>>> D = entropy(pk, qk, base=base)
|
||||
>>> D
|
||||
0.7369655941662062
|
||||
>>> D == np.sum(pk * np.log(pk/qk)) / np.log(base)
|
||||
True
|
||||
|
||||
The cross entropy can be calculated as the sum of the entropy and
|
||||
relative entropy`:
|
||||
|
||||
>>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
|
||||
>>> CE
|
||||
1.736965594166206
|
||||
>>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
|
||||
True
|
||||
|
||||
"""
|
||||
if base is not None and base <= 0:
|
||||
raise ValueError("`base` must be a positive number or `None`.")
|
||||
|
||||
pk = np.asarray(pk)
|
||||
pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
|
||||
if qk is None:
|
||||
vec = special.entr(pk)
|
||||
else:
|
||||
qk = np.asarray(qk)
|
||||
pk, qk = np.broadcast_arrays(pk, qk)
|
||||
qk = 1.0*qk / np.sum(qk, axis=axis, keepdims=True)
|
||||
vec = special.rel_entr(pk, qk)
|
||||
S = np.sum(vec, axis=axis)
|
||||
if base is not None:
|
||||
S /= np.log(base)
|
||||
return S
|
||||
|
||||
|
||||
def differential_entropy(
|
||||
values: np.typing.ArrayLike,
|
||||
*,
|
||||
window_length: Optional[int] = None,
|
||||
base: Optional[float] = None,
|
||||
axis: int = 0,
|
||||
method: str = "auto",
|
||||
) -> Union[np.number, np.ndarray]:
|
||||
r"""Given a sample of a distribution, estimate the differential entropy.
|
||||
|
||||
Several estimation methods are available using the `method` parameter. By
|
||||
default, a method is selected based the size of the sample.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : sequence
|
||||
Sample from a continuous distribution.
|
||||
window_length : int, optional
|
||||
Window length for computing Vasicek estimate. Must be an integer
|
||||
between 1 and half of the sample size. If ``None`` (the default), it
|
||||
uses the heuristic value
|
||||
|
||||
.. math::
|
||||
\left \lfloor \sqrt{n} + 0.5 \right \rfloor
|
||||
|
||||
where :math:`n` is the sample size. This heuristic was originally
|
||||
proposed in [2]_ and has become common in the literature.
|
||||
base : float, optional
|
||||
The logarithmic base to use, defaults to ``e`` (natural logarithm).
|
||||
axis : int, optional
|
||||
The axis along which the differential entropy is calculated.
|
||||
Default is 0.
|
||||
method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
|
||||
The method used to estimate the differential entropy from the sample.
|
||||
Default is ``'auto'``. See Notes for more information.
|
||||
|
||||
Returns
|
||||
-------
|
||||
entropy : float
|
||||
The calculated differential entropy.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This function will converge to the true differential entropy in the limit
|
||||
|
||||
.. math::
|
||||
n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0
|
||||
|
||||
The optimal choice of ``window_length`` for a given sample size depends on
|
||||
the (unknown) distribution. Typically, the smoother the density of the
|
||||
distribution, the larger the optimal value of ``window_length`` [1]_.
|
||||
|
||||
The following options are available for the `method` parameter.
|
||||
|
||||
* ``'vasicek'`` uses the estimator presented in [1]_. This is
|
||||
one of the first and most influential estimators of differential entropy.
|
||||
* ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
|
||||
is not only consistent but, under some conditions, asymptotically normal.
|
||||
* ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
|
||||
in simulation to have smaller bias and mean squared error than
|
||||
the Vasicek estimator.
|
||||
* ``'correa'`` uses the estimator presented in [5]_ based on local linear
|
||||
regression. In a simulation study, it had consistently smaller mean
|
||||
square error than the Vasiceck estimator, but it is more expensive to
|
||||
compute.
|
||||
* ``'auto'`` selects the method automatically (default). Currently,
|
||||
this selects ``'van es'`` for very small samples (<10), ``'ebrahimi'``
|
||||
for moderate sample sizes (11-1000), and ``'vasicek'`` for larger
|
||||
samples, but this behavior is subject to change in future versions.
|
||||
|
||||
All estimators are implemented as described in [6]_.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
|
||||
Journal of the Royal Statistical Society:
|
||||
Series B (Methodological), 38(1), 54-59.
|
||||
.. [2] Crzcgorzewski, P., & Wirczorkowski, R. (1999). Entropy-based
|
||||
goodness-of-fit test for exponentiality. Communications in
|
||||
Statistics-Theory and Methods, 28(5), 1183-1202.
|
||||
.. [3] Van Es, B. (1992). Estimating functionals related to a density by a
|
||||
class of statistics based on spacings. Scandinavian Journal of
|
||||
Statistics, 61-72.
|
||||
.. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
|
||||
of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
|
||||
.. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
|
||||
in Statistics-Theory and Methods, 24(10), 2439-2449.
|
||||
.. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
|
||||
Annals of Data Science, 2(2), 231-241.
|
||||
https://link.springer.com/article/10.1007/s40745-015-0045-9
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import differential_entropy, norm
|
||||
|
||||
Entropy of a standard normal distribution:
|
||||
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> values = rng.standard_normal(100)
|
||||
>>> differential_entropy(values)
|
||||
1.3407817436640392
|
||||
|
||||
Compare with the true entropy:
|
||||
|
||||
>>> float(norm.entropy())
|
||||
1.4189385332046727
|
||||
|
||||
For several sample sizes between 5 and 1000, compare the accuracy of
|
||||
the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
|
||||
compare the root mean squared error (over 1000 trials) between the estimate
|
||||
and the true differential entropy of the distribution.
|
||||
|
||||
>>> from scipy import stats
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>>
|
||||
>>>
|
||||
>>> def rmse(res, expected):
|
||||
... '''Root mean squared error'''
|
||||
... return np.sqrt(np.mean((res - expected)**2))
|
||||
>>>
|
||||
>>>
|
||||
>>> a, b = np.log10(5), np.log10(1000)
|
||||
>>> ns = np.round(np.logspace(a, b, 10)).astype(int)
|
||||
>>> reps = 1000 # number of repetitions for each sample size
|
||||
>>> expected = stats.expon.entropy()
|
||||
>>>
|
||||
>>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
|
||||
>>> for method in method_errors:
|
||||
... for n in ns:
|
||||
... rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
|
||||
... res = stats.differential_entropy(rvs, method=method, axis=-1)
|
||||
... error = rmse(res, expected)
|
||||
... method_errors[method].append(error)
|
||||
>>>
|
||||
>>> for method, errors in method_errors.items():
|
||||
... plt.loglog(ns, errors, label=method)
|
||||
>>>
|
||||
>>> plt.legend()
|
||||
>>> plt.xlabel('sample size')
|
||||
>>> plt.ylabel('RMSE (1000 trials)')
|
||||
>>> plt.title('Entropy Estimator Error (Exponential Distribution)')
|
||||
|
||||
"""
|
||||
values = np.asarray(values)
|
||||
values = np.moveaxis(values, axis, -1)
|
||||
n = values.shape[-1] # number of observations
|
||||
|
||||
if window_length is None:
|
||||
window_length = math.floor(math.sqrt(n) + 0.5)
|
||||
|
||||
if not 2 <= 2 * window_length < n:
|
||||
raise ValueError(
|
||||
f"Window length ({window_length}) must be positive and less "
|
||||
f"than half the sample size ({n}).",
|
||||
)
|
||||
|
||||
if base is not None and base <= 0:
|
||||
raise ValueError("`base` must be a positive number or `None`.")
|
||||
|
||||
sorted_data = np.sort(values, axis=-1)
|
||||
|
||||
methods = {"vasicek": _vasicek_entropy,
|
||||
"van es": _van_es_entropy,
|
||||
"correa": _correa_entropy,
|
||||
"ebrahimi": _ebrahimi_entropy,
|
||||
"auto": _vasicek_entropy}
|
||||
method = method.lower()
|
||||
if method not in methods:
|
||||
message = f"`method` must be one of {set(methods)}"
|
||||
raise ValueError(message)
|
||||
|
||||
if method == "auto":
|
||||
if n <= 10:
|
||||
method = 'van es'
|
||||
elif n <= 1000:
|
||||
method = 'ebrahimi'
|
||||
else:
|
||||
method = 'vasicek'
|
||||
|
||||
res = methods[method](sorted_data, window_length)
|
||||
|
||||
if base is not None:
|
||||
res /= np.log(base)
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def _pad_along_last_axis(X, m):
|
||||
"""Pad the data for computing the rolling window difference."""
|
||||
# scales a bit better than method in _vasicek_like_entropy
|
||||
shape = np.array(X.shape)
|
||||
shape[-1] = m
|
||||
Xl = np.broadcast_to(X[..., [0]], shape) # [0] vs 0 to maintain shape
|
||||
Xr = np.broadcast_to(X[..., [-1]], shape)
|
||||
return np.concatenate((Xl, X, Xr), axis=-1)
|
||||
|
||||
|
||||
def _vasicek_entropy(X, m):
|
||||
"""Compute the Vasicek estimator as described in [6] Eq. 1.3."""
|
||||
n = X.shape[-1]
|
||||
X = _pad_along_last_axis(X, m)
|
||||
differences = X[..., 2 * m:] - X[..., : -2 * m:]
|
||||
logs = np.log(n/(2*m) * differences)
|
||||
return np.mean(logs, axis=-1)
|
||||
|
||||
|
||||
def _van_es_entropy(X, m):
|
||||
"""Compute the van Es estimator as described in [6]."""
|
||||
# No equation number, but referred to as HVE_mn.
|
||||
# Typo: there should be a log within the summation.
|
||||
n = X.shape[-1]
|
||||
difference = X[..., m:] - X[..., :-m]
|
||||
term1 = 1/(n-m) * np.sum(np.log((n+1)/m * difference), axis=-1)
|
||||
k = np.arange(m, n+1)
|
||||
return term1 + np.sum(1/k) + np.log(m) - np.log(n+1)
|
||||
|
||||
|
||||
def _ebrahimi_entropy(X, m):
|
||||
"""Compute the Ebrahimi estimator as described in [6]."""
|
||||
# No equation number, but referred to as HE_mn
|
||||
n = X.shape[-1]
|
||||
X = _pad_along_last_axis(X, m)
|
||||
|
||||
differences = X[..., 2 * m:] - X[..., : -2 * m:]
|
||||
|
||||
i = np.arange(1, n+1).astype(float)
|
||||
ci = np.ones_like(i)*2
|
||||
ci[i <= m] = 1 + (i[i <= m] - 1)/m
|
||||
ci[i >= n - m + 1] = 1 + (n - i[i >= n-m+1])/m
|
||||
|
||||
logs = np.log(n * differences / (ci * m))
|
||||
return np.mean(logs, axis=-1)
|
||||
|
||||
|
||||
def _correa_entropy(X, m):
|
||||
"""Compute the Correa estimator as described in [6]."""
|
||||
# No equation number, but referred to as HC_mn
|
||||
n = X.shape[-1]
|
||||
X = _pad_along_last_axis(X, m)
|
||||
|
||||
i = np.arange(1, n+1)
|
||||
dj = np.arange(-m, m+1)[:, None]
|
||||
j = i + dj
|
||||
j0 = j + m - 1 # 0-indexed version of j
|
||||
|
||||
Xibar = np.mean(X[..., j0], axis=-2, keepdims=True)
|
||||
difference = X[..., j0] - Xibar
|
||||
num = np.sum(difference*dj, axis=-2) # dj is d-i
|
||||
den = n*np.sum(difference**2, axis=-2)
|
||||
return -np.mean(np.log(num/den), axis=-1)
|
||||
1284
.CondaPkg/env/Lib/site-packages/scipy/stats/_fit.py
vendored
Normal file
1284
.CondaPkg/env/Lib/site-packages/scipy/stats/_fit.py
vendored
Normal file
File diff suppressed because it is too large
Load Diff
75
.CondaPkg/env/Lib/site-packages/scipy/stats/_generate_pyx.py
vendored
Normal file
75
.CondaPkg/env/Lib/site-packages/scipy/stats/_generate_pyx.py
vendored
Normal file
@@ -0,0 +1,75 @@
|
||||
import pathlib
|
||||
import subprocess
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
|
||||
|
||||
def isNPY_OLD():
|
||||
'''
|
||||
A new random C API was added in 1.18 and became stable in 1.19.
|
||||
Prefer the new random C API when building with recent numpy.
|
||||
'''
|
||||
import numpy as np
|
||||
ver = tuple(int(num) for num in np.__version__.split('.')[:2])
|
||||
return ver < (1, 19)
|
||||
|
||||
|
||||
def make_biasedurn(outdir):
|
||||
'''Substitute True/False values for NPY_OLD Cython build variable.'''
|
||||
biasedurn_base = (pathlib.Path(__file__).parent / '_biasedurn').absolute()
|
||||
with open(biasedurn_base.with_suffix('.pyx.templ'), 'r') as src:
|
||||
contents = src.read()
|
||||
|
||||
outfile = outdir / '_biasedurn.pyx'
|
||||
with open(outfile, 'w') as dest:
|
||||
dest.write(contents.format(NPY_OLD=str(bool(isNPY_OLD()))))
|
||||
|
||||
|
||||
def make_unuran(srcdir, outdir):
|
||||
"""Substitute True/False values for NPY_OLD Cython build variable."""
|
||||
import re
|
||||
with open(srcdir / "unuran_wrapper.pyx.templ", "r") as src:
|
||||
contents = src.read()
|
||||
with open(outdir / "unuran_wrapper.pyx", "w") as dest:
|
||||
dest.write(re.sub("DEF NPY_OLD = isNPY_OLD",
|
||||
f"DEF NPY_OLD = {isNPY_OLD()}",
|
||||
contents))
|
||||
|
||||
|
||||
def make_boost(outdir, distutils_build=False):
|
||||
# Call code generator inside _boost directory
|
||||
code_gen = pathlib.Path(__file__).parent / '_boost/include/code_gen.py'
|
||||
if distutils_build:
|
||||
subprocess.run([sys.executable, str(code_gen), '-o', outdir,
|
||||
'--distutils-build', 'True'], check=True)
|
||||
else:
|
||||
subprocess.run([sys.executable, str(code_gen), '-o', outdir],
|
||||
check=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("-o", "--outdir", type=str,
|
||||
help="Path to the output directory")
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.outdir:
|
||||
# We're dealing with a distutils build here, write in-place:
|
||||
outdir_abs = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
|
||||
make_biasedurn(outdir_abs)
|
||||
|
||||
outdir_abs_boost = outdir_abs / '_boost' / 'src'
|
||||
if not os.path.exists(outdir_abs_boost):
|
||||
os.makedirs(outdir_abs_boost)
|
||||
make_boost(outdir_abs_boost, distutils_build=True)
|
||||
|
||||
outdir_abs_unuran = outdir_abs / '_unuran'
|
||||
make_unuran(outdir_abs_unuran, outdir_abs_unuran)
|
||||
else:
|
||||
# Meson build
|
||||
srcdir_abs = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
|
||||
outdir_abs = pathlib.Path(os.getcwd()) / args.outdir
|
||||
make_biasedurn(outdir_abs)
|
||||
make_boost(outdir_abs)
|
||||
make_unuran(srcdir_abs / '_unuran', outdir_abs)
|
||||
2006
.CondaPkg/env/Lib/site-packages/scipy/stats/_hypotests.py
vendored
Normal file
2006
.CondaPkg/env/Lib/site-packages/scipy/stats/_hypotests.py
vendored
Normal file
File diff suppressed because it is too large
Load Diff
725
.CondaPkg/env/Lib/site-packages/scipy/stats/_kde.py
vendored
Normal file
725
.CondaPkg/env/Lib/site-packages/scipy/stats/_kde.py
vendored
Normal file
@@ -0,0 +1,725 @@
|
||||
#-------------------------------------------------------------------------------
|
||||
#
|
||||
# Define classes for (uni/multi)-variate kernel density estimation.
|
||||
#
|
||||
# Currently, only Gaussian kernels are implemented.
|
||||
#
|
||||
# Written by: Robert Kern
|
||||
#
|
||||
# Date: 2004-08-09
|
||||
#
|
||||
# Modified: 2005-02-10 by Robert Kern.
|
||||
# Contributed to SciPy
|
||||
# 2005-10-07 by Robert Kern.
|
||||
# Some fixes to match the new scipy_core
|
||||
#
|
||||
# Copyright 2004-2005 by Enthought, Inc.
|
||||
#
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
# Standard library imports.
|
||||
import warnings
|
||||
|
||||
# SciPy imports.
|
||||
from scipy import linalg, special
|
||||
from scipy._lib._util import check_random_state
|
||||
|
||||
from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, exp, pi,
|
||||
sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
|
||||
ones, cov)
|
||||
import numpy as np
|
||||
|
||||
# Local imports.
|
||||
from . import _mvn
|
||||
from ._stats import gaussian_kernel_estimate, gaussian_kernel_estimate_log
|
||||
|
||||
|
||||
__all__ = ['gaussian_kde']
|
||||
|
||||
|
||||
class gaussian_kde:
|
||||
"""Representation of a kernel-density estimate using Gaussian kernels.
|
||||
|
||||
Kernel density estimation is a way to estimate the probability density
|
||||
function (PDF) of a random variable in a non-parametric way.
|
||||
`gaussian_kde` works for both uni-variate and multi-variate data. It
|
||||
includes automatic bandwidth determination. The estimation works best for
|
||||
a unimodal distribution; bimodal or multi-modal distributions tend to be
|
||||
oversmoothed.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dataset : array_like
|
||||
Datapoints to estimate from. In case of univariate data this is a 1-D
|
||||
array, otherwise a 2-D array with shape (# of dims, # of data).
|
||||
bw_method : str, scalar or callable, optional
|
||||
The method used to calculate the estimator bandwidth. This can be
|
||||
'scott', 'silverman', a scalar constant or a callable. If a scalar,
|
||||
this will be used directly as `kde.factor`. If a callable, it should
|
||||
take a `gaussian_kde` instance as only parameter and return a scalar.
|
||||
If None (default), 'scott' is used. See Notes for more details.
|
||||
weights : array_like, optional
|
||||
weights of datapoints. This must be the same shape as dataset.
|
||||
If None (default), the samples are assumed to be equally weighted
|
||||
|
||||
Attributes
|
||||
----------
|
||||
dataset : ndarray
|
||||
The dataset with which `gaussian_kde` was initialized.
|
||||
d : int
|
||||
Number of dimensions.
|
||||
n : int
|
||||
Number of datapoints.
|
||||
neff : int
|
||||
Effective number of datapoints.
|
||||
|
||||
.. versionadded:: 1.2.0
|
||||
factor : float
|
||||
The bandwidth factor, obtained from `kde.covariance_factor`. The square
|
||||
of `kde.factor` multiplies the covariance matrix of the data in the kde
|
||||
estimation.
|
||||
covariance : ndarray
|
||||
The covariance matrix of `dataset`, scaled by the calculated bandwidth
|
||||
(`kde.factor`).
|
||||
inv_cov : ndarray
|
||||
The inverse of `covariance`.
|
||||
|
||||
Methods
|
||||
-------
|
||||
evaluate
|
||||
__call__
|
||||
integrate_gaussian
|
||||
integrate_box_1d
|
||||
integrate_box
|
||||
integrate_kde
|
||||
pdf
|
||||
logpdf
|
||||
resample
|
||||
set_bandwidth
|
||||
covariance_factor
|
||||
|
||||
Notes
|
||||
-----
|
||||
Bandwidth selection strongly influences the estimate obtained from the KDE
|
||||
(much more so than the actual shape of the kernel). Bandwidth selection
|
||||
can be done by a "rule of thumb", by cross-validation, by "plug-in
|
||||
methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
|
||||
uses a rule of thumb, the default is Scott's Rule.
|
||||
|
||||
Scott's Rule [1]_, implemented as `scotts_factor`, is::
|
||||
|
||||
n**(-1./(d+4)),
|
||||
|
||||
with ``n`` the number of data points and ``d`` the number of dimensions.
|
||||
In the case of unequally weighted points, `scotts_factor` becomes::
|
||||
|
||||
neff**(-1./(d+4)),
|
||||
|
||||
with ``neff`` the effective number of datapoints.
|
||||
Silverman's Rule [2]_, implemented as `silverman_factor`, is::
|
||||
|
||||
(n * (d + 2) / 4.)**(-1. / (d + 4)).
|
||||
|
||||
or in the case of unequally weighted points::
|
||||
|
||||
(neff * (d + 2) / 4.)**(-1. / (d + 4)).
|
||||
|
||||
Good general descriptions of kernel density estimation can be found in [1]_
|
||||
and [2]_, the mathematics for this multi-dimensional implementation can be
|
||||
found in [1]_.
|
||||
|
||||
With a set of weighted samples, the effective number of datapoints ``neff``
|
||||
is defined by::
|
||||
|
||||
neff = sum(weights)^2 / sum(weights^2)
|
||||
|
||||
as detailed in [5]_.
|
||||
|
||||
`gaussian_kde` does not currently support data that lies in a
|
||||
lower-dimensional subspace of the space in which it is expressed. For such
|
||||
data, consider performing principle component analysis / dimensionality
|
||||
reduction and using `gaussian_kde` with the transformed data.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
|
||||
Visualization", John Wiley & Sons, New York, Chicester, 1992.
|
||||
.. [2] B.W. Silverman, "Density Estimation for Statistics and Data
|
||||
Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
|
||||
Chapman and Hall, London, 1986.
|
||||
.. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
|
||||
Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
|
||||
.. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
|
||||
conditional density estimation", Computational Statistics & Data
|
||||
Analysis, Vol. 36, pp. 279-298, 2001.
|
||||
.. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
|
||||
Series A (General), 132, 272
|
||||
|
||||
Examples
|
||||
--------
|
||||
Generate some random two-dimensional data:
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> def measure(n):
|
||||
... "Measurement model, return two coupled measurements."
|
||||
... m1 = np.random.normal(size=n)
|
||||
... m2 = np.random.normal(scale=0.5, size=n)
|
||||
... return m1+m2, m1-m2
|
||||
|
||||
>>> m1, m2 = measure(2000)
|
||||
>>> xmin = m1.min()
|
||||
>>> xmax = m1.max()
|
||||
>>> ymin = m2.min()
|
||||
>>> ymax = m2.max()
|
||||
|
||||
Perform a kernel density estimate on the data:
|
||||
|
||||
>>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
|
||||
>>> positions = np.vstack([X.ravel(), Y.ravel()])
|
||||
>>> values = np.vstack([m1, m2])
|
||||
>>> kernel = stats.gaussian_kde(values)
|
||||
>>> Z = np.reshape(kernel(positions).T, X.shape)
|
||||
|
||||
Plot the results:
|
||||
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> fig, ax = plt.subplots()
|
||||
>>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
|
||||
... extent=[xmin, xmax, ymin, ymax])
|
||||
>>> ax.plot(m1, m2, 'k.', markersize=2)
|
||||
>>> ax.set_xlim([xmin, xmax])
|
||||
>>> ax.set_ylim([ymin, ymax])
|
||||
>>> plt.show()
|
||||
|
||||
"""
|
||||
def __init__(self, dataset, bw_method=None, weights=None):
|
||||
self.dataset = atleast_2d(asarray(dataset))
|
||||
if not self.dataset.size > 1:
|
||||
raise ValueError("`dataset` input should have multiple elements.")
|
||||
|
||||
self.d, self.n = self.dataset.shape
|
||||
|
||||
if weights is not None:
|
||||
self._weights = atleast_1d(weights).astype(float)
|
||||
self._weights /= sum(self._weights)
|
||||
if self.weights.ndim != 1:
|
||||
raise ValueError("`weights` input should be one-dimensional.")
|
||||
if len(self._weights) != self.n:
|
||||
raise ValueError("`weights` input should be of length n")
|
||||
self._neff = 1/sum(self._weights**2)
|
||||
|
||||
# This can be converted to a warning once gh-10205 is resolved
|
||||
if self.d > self.n:
|
||||
msg = ("Number of dimensions is greater than number of samples. "
|
||||
"This results in a singular data covariance matrix, which "
|
||||
"cannot be treated using the algorithms implemented in "
|
||||
"`gaussian_kde`. Note that `gaussian_kde` interprets each "
|
||||
"*column* of `dataset` to be a point; consider transposing "
|
||||
"the input to `dataset`.")
|
||||
raise ValueError(msg)
|
||||
|
||||
try:
|
||||
self.set_bandwidth(bw_method=bw_method)
|
||||
except linalg.LinAlgError as e:
|
||||
msg = ("The data appears to lie in a lower-dimensional subspace "
|
||||
"of the space in which it is expressed. This has resulted "
|
||||
"in a singular data covariance matrix, which cannot be "
|
||||
"treated using the algorithms implemented in "
|
||||
"`gaussian_kde`. Consider performing principle component "
|
||||
"analysis / dimensionality reduction and using "
|
||||
"`gaussian_kde` with the transformed data.")
|
||||
raise linalg.LinAlgError(msg) from e
|
||||
|
||||
def evaluate(self, points):
|
||||
"""Evaluate the estimated pdf on a set of points.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
points : (# of dimensions, # of points)-array
|
||||
Alternatively, a (# of dimensions,) vector can be passed in and
|
||||
treated as a single point.
|
||||
|
||||
Returns
|
||||
-------
|
||||
values : (# of points,)-array
|
||||
The values at each point.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError : if the dimensionality of the input points is different than
|
||||
the dimensionality of the KDE.
|
||||
|
||||
"""
|
||||
points = atleast_2d(asarray(points))
|
||||
|
||||
d, m = points.shape
|
||||
if d != self.d:
|
||||
if d == 1 and m == self.d:
|
||||
# points was passed in as a row vector
|
||||
points = reshape(points, (self.d, 1))
|
||||
m = 1
|
||||
else:
|
||||
msg = "points have dimension %s, dataset has dimension %s" % (d,
|
||||
self.d)
|
||||
raise ValueError(msg)
|
||||
|
||||
output_dtype, spec = _get_output_dtype(self.covariance, points)
|
||||
result = gaussian_kernel_estimate[spec](
|
||||
self.dataset.T, self.weights[:, None],
|
||||
points.T, self.cho_cov, output_dtype)
|
||||
|
||||
return result[:, 0]
|
||||
|
||||
__call__ = evaluate
|
||||
|
||||
def integrate_gaussian(self, mean, cov):
|
||||
"""
|
||||
Multiply estimated density by a multivariate Gaussian and integrate
|
||||
over the whole space.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mean : aray_like
|
||||
A 1-D array, specifying the mean of the Gaussian.
|
||||
cov : array_like
|
||||
A 2-D array, specifying the covariance matrix of the Gaussian.
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : scalar
|
||||
The value of the integral.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the mean or covariance of the input Gaussian differs from
|
||||
the KDE's dimensionality.
|
||||
|
||||
"""
|
||||
mean = atleast_1d(squeeze(mean))
|
||||
cov = atleast_2d(cov)
|
||||
|
||||
if mean.shape != (self.d,):
|
||||
raise ValueError("mean does not have dimension %s" % self.d)
|
||||
if cov.shape != (self.d, self.d):
|
||||
raise ValueError("covariance does not have dimension %s" % self.d)
|
||||
|
||||
# make mean a column vector
|
||||
mean = mean[:, newaxis]
|
||||
|
||||
sum_cov = self.covariance + cov
|
||||
|
||||
# This will raise LinAlgError if the new cov matrix is not s.p.d
|
||||
# cho_factor returns (ndarray, bool) where bool is a flag for whether
|
||||
# or not ndarray is upper or lower triangular
|
||||
sum_cov_chol = linalg.cho_factor(sum_cov)
|
||||
|
||||
diff = self.dataset - mean
|
||||
tdiff = linalg.cho_solve(sum_cov_chol, diff)
|
||||
|
||||
sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
|
||||
norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
|
||||
|
||||
energies = sum(diff * tdiff, axis=0) / 2.0
|
||||
result = sum(exp(-energies)*self.weights, axis=0) / norm_const
|
||||
|
||||
return result
|
||||
|
||||
def integrate_box_1d(self, low, high):
|
||||
"""
|
||||
Computes the integral of a 1D pdf between two bounds.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
low : scalar
|
||||
Lower bound of integration.
|
||||
high : scalar
|
||||
Upper bound of integration.
|
||||
|
||||
Returns
|
||||
-------
|
||||
value : scalar
|
||||
The result of the integral.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the KDE is over more than one dimension.
|
||||
|
||||
"""
|
||||
if self.d != 1:
|
||||
raise ValueError("integrate_box_1d() only handles 1D pdfs")
|
||||
|
||||
stdev = ravel(sqrt(self.covariance))[0]
|
||||
|
||||
normalized_low = ravel((low - self.dataset) / stdev)
|
||||
normalized_high = ravel((high - self.dataset) / stdev)
|
||||
|
||||
value = np.sum(self.weights*(
|
||||
special.ndtr(normalized_high) -
|
||||
special.ndtr(normalized_low)))
|
||||
return value
|
||||
|
||||
def integrate_box(self, low_bounds, high_bounds, maxpts=None):
|
||||
"""Computes the integral of a pdf over a rectangular interval.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
low_bounds : array_like
|
||||
A 1-D array containing the lower bounds of integration.
|
||||
high_bounds : array_like
|
||||
A 1-D array containing the upper bounds of integration.
|
||||
maxpts : int, optional
|
||||
The maximum number of points to use for integration.
|
||||
|
||||
Returns
|
||||
-------
|
||||
value : scalar
|
||||
The result of the integral.
|
||||
|
||||
"""
|
||||
if maxpts is not None:
|
||||
extra_kwds = {'maxpts': maxpts}
|
||||
else:
|
||||
extra_kwds = {}
|
||||
|
||||
value, inform = _mvn.mvnun_weighted(low_bounds, high_bounds,
|
||||
self.dataset, self.weights,
|
||||
self.covariance, **extra_kwds)
|
||||
if inform:
|
||||
msg = ('An integral in _mvn.mvnun requires more points than %s' %
|
||||
(self.d * 1000))
|
||||
warnings.warn(msg)
|
||||
|
||||
return value
|
||||
|
||||
def integrate_kde(self, other):
|
||||
"""
|
||||
Computes the integral of the product of this kernel density estimate
|
||||
with another.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
other : gaussian_kde instance
|
||||
The other kde.
|
||||
|
||||
Returns
|
||||
-------
|
||||
value : scalar
|
||||
The result of the integral.
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If the KDEs have different dimensionality.
|
||||
|
||||
"""
|
||||
if other.d != self.d:
|
||||
raise ValueError("KDEs are not the same dimensionality")
|
||||
|
||||
# we want to iterate over the smallest number of points
|
||||
if other.n < self.n:
|
||||
small = other
|
||||
large = self
|
||||
else:
|
||||
small = self
|
||||
large = other
|
||||
|
||||
sum_cov = small.covariance + large.covariance
|
||||
sum_cov_chol = linalg.cho_factor(sum_cov)
|
||||
result = 0.0
|
||||
for i in range(small.n):
|
||||
mean = small.dataset[:, i, newaxis]
|
||||
diff = large.dataset - mean
|
||||
tdiff = linalg.cho_solve(sum_cov_chol, diff)
|
||||
|
||||
energies = sum(diff * tdiff, axis=0) / 2.0
|
||||
result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]
|
||||
|
||||
sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
|
||||
norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
|
||||
|
||||
result /= norm_const
|
||||
|
||||
return result
|
||||
|
||||
def resample(self, size=None, seed=None):
|
||||
"""Randomly sample a dataset from the estimated pdf.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
size : int, optional
|
||||
The number of samples to draw. If not provided, then the size is
|
||||
the same as the effective number of samples in the underlying
|
||||
dataset.
|
||||
seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
|
||||
If `seed` is None (or `np.random`), the `numpy.random.RandomState`
|
||||
singleton is used.
|
||||
If `seed` is an int, a new ``RandomState`` instance is used,
|
||||
seeded with `seed`.
|
||||
If `seed` is already a ``Generator`` or ``RandomState`` instance then
|
||||
that instance is used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
resample : (self.d, `size`) ndarray
|
||||
The sampled dataset.
|
||||
|
||||
"""
|
||||
if size is None:
|
||||
size = int(self.neff)
|
||||
|
||||
random_state = check_random_state(seed)
|
||||
norm = transpose(random_state.multivariate_normal(
|
||||
zeros((self.d,), float), self.covariance, size=size
|
||||
))
|
||||
indices = random_state.choice(self.n, size=size, p=self.weights)
|
||||
means = self.dataset[:, indices]
|
||||
|
||||
return means + norm
|
||||
|
||||
def scotts_factor(self):
|
||||
"""Compute Scott's factor.
|
||||
|
||||
Returns
|
||||
-------
|
||||
s : float
|
||||
Scott's factor.
|
||||
"""
|
||||
return power(self.neff, -1./(self.d+4))
|
||||
|
||||
def silverman_factor(self):
|
||||
"""Compute the Silverman factor.
|
||||
|
||||
Returns
|
||||
-------
|
||||
s : float
|
||||
The silverman factor.
|
||||
"""
|
||||
return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))
|
||||
|
||||
# Default method to calculate bandwidth, can be overwritten by subclass
|
||||
covariance_factor = scotts_factor
|
||||
covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
|
||||
multiplies the data covariance matrix to obtain the kernel covariance
|
||||
matrix. The default is `scotts_factor`. A subclass can overwrite this
|
||||
method to provide a different method, or set it through a call to
|
||||
`kde.set_bandwidth`."""
|
||||
|
||||
def set_bandwidth(self, bw_method=None):
|
||||
"""Compute the estimator bandwidth with given method.
|
||||
|
||||
The new bandwidth calculated after a call to `set_bandwidth` is used
|
||||
for subsequent evaluations of the estimated density.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
bw_method : str, scalar or callable, optional
|
||||
The method used to calculate the estimator bandwidth. This can be
|
||||
'scott', 'silverman', a scalar constant or a callable. If a
|
||||
scalar, this will be used directly as `kde.factor`. If a callable,
|
||||
it should take a `gaussian_kde` instance as only parameter and
|
||||
return a scalar. If None (default), nothing happens; the current
|
||||
`kde.covariance_factor` method is kept.
|
||||
|
||||
Notes
|
||||
-----
|
||||
.. versionadded:: 0.11
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> import scipy.stats as stats
|
||||
>>> x1 = np.array([-7, -5, 1, 4, 5.])
|
||||
>>> kde = stats.gaussian_kde(x1)
|
||||
>>> xs = np.linspace(-10, 10, num=50)
|
||||
>>> y1 = kde(xs)
|
||||
>>> kde.set_bandwidth(bw_method='silverman')
|
||||
>>> y2 = kde(xs)
|
||||
>>> kde.set_bandwidth(bw_method=kde.factor / 3.)
|
||||
>>> y3 = kde(xs)
|
||||
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> fig, ax = plt.subplots()
|
||||
>>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
|
||||
... label='Data points (rescaled)')
|
||||
>>> ax.plot(xs, y1, label='Scott (default)')
|
||||
>>> ax.plot(xs, y2, label='Silverman')
|
||||
>>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
|
||||
>>> ax.legend()
|
||||
>>> plt.show()
|
||||
|
||||
"""
|
||||
if bw_method is None:
|
||||
pass
|
||||
elif bw_method == 'scott':
|
||||
self.covariance_factor = self.scotts_factor
|
||||
elif bw_method == 'silverman':
|
||||
self.covariance_factor = self.silverman_factor
|
||||
elif np.isscalar(bw_method) and not isinstance(bw_method, str):
|
||||
self._bw_method = 'use constant'
|
||||
self.covariance_factor = lambda: bw_method
|
||||
elif callable(bw_method):
|
||||
self._bw_method = bw_method
|
||||
self.covariance_factor = lambda: self._bw_method(self)
|
||||
else:
|
||||
msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
|
||||
"or a callable."
|
||||
raise ValueError(msg)
|
||||
|
||||
self._compute_covariance()
|
||||
|
||||
def _compute_covariance(self):
|
||||
"""Computes the covariance matrix for each Gaussian kernel using
|
||||
covariance_factor().
|
||||
"""
|
||||
self.factor = self.covariance_factor()
|
||||
# Cache covariance and Cholesky decomp of covariance
|
||||
if not hasattr(self, '_data_cho_cov'):
|
||||
self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
|
||||
bias=False,
|
||||
aweights=self.weights))
|
||||
self._data_cho_cov = linalg.cholesky(self._data_covariance,
|
||||
lower=True)
|
||||
|
||||
self.covariance = self._data_covariance * self.factor**2
|
||||
self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
|
||||
self.log_det = 2*np.log(np.diag(self.cho_cov
|
||||
* np.sqrt(2*pi))).sum()
|
||||
|
||||
@property
|
||||
def inv_cov(self):
|
||||
# Re-compute from scratch each time because I'm not sure how this is
|
||||
# used in the wild. (Perhaps users change the `dataset`, since it's
|
||||
# not a private attribute?) `_compute_covariance` used to recalculate
|
||||
# all these, so we'll recalculate everything now that this is a
|
||||
# a property.
|
||||
self.factor = self.covariance_factor()
|
||||
self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
|
||||
bias=False, aweights=self.weights))
|
||||
return linalg.inv(self._data_covariance) / self.factor**2
|
||||
|
||||
def pdf(self, x):
|
||||
"""
|
||||
Evaluate the estimated pdf on a provided set of points.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
|
||||
docstring for more details.
|
||||
|
||||
"""
|
||||
return self.evaluate(x)
|
||||
|
||||
def logpdf(self, x):
|
||||
"""
|
||||
Evaluate the log of the estimated pdf on a provided set of points.
|
||||
"""
|
||||
points = atleast_2d(x)
|
||||
|
||||
d, m = points.shape
|
||||
if d != self.d:
|
||||
if d == 1 and m == self.d:
|
||||
# points was passed in as a row vector
|
||||
points = reshape(points, (self.d, 1))
|
||||
m = 1
|
||||
else:
|
||||
msg = (f"points have dimension {d}, "
|
||||
f"dataset has dimension {self.d}")
|
||||
raise ValueError(msg)
|
||||
|
||||
output_dtype, spec = _get_output_dtype(self.covariance, points)
|
||||
result = gaussian_kernel_estimate_log[spec](
|
||||
self.dataset.T, self.weights[:, None],
|
||||
points.T, self.cho_cov, output_dtype)
|
||||
|
||||
return result[:, 0]
|
||||
|
||||
def marginal(self, dimensions):
|
||||
"""Return a marginal KDE distribution
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dimensions : int or 1-d array_like
|
||||
The dimensions of the multivariate distribution corresponding
|
||||
with the marginal variables, that is, the indices of the dimensions
|
||||
that are being retained. The other dimensions are marginalized out.
|
||||
|
||||
Returns
|
||||
-------
|
||||
marginal_kde : gaussian_kde
|
||||
An object representing the marginal distribution.
|
||||
|
||||
Notes
|
||||
-----
|
||||
.. versionadded:: 1.10.0
|
||||
|
||||
"""
|
||||
|
||||
dims = np.atleast_1d(dimensions)
|
||||
|
||||
if not np.issubdtype(dims.dtype, np.integer):
|
||||
msg = ("Elements of `dimensions` must be integers - the indices "
|
||||
"of the marginal variables being retained.")
|
||||
raise ValueError(msg)
|
||||
|
||||
n = len(self.dataset) # number of dimensions
|
||||
original_dims = dims.copy()
|
||||
|
||||
dims[dims < 0] = n + dims[dims < 0]
|
||||
|
||||
if len(np.unique(dims)) != len(dims):
|
||||
msg = ("All elements of `dimensions` must be unique.")
|
||||
raise ValueError(msg)
|
||||
|
||||
i_invalid = (dims < 0) | (dims >= n)
|
||||
if np.any(i_invalid):
|
||||
msg = (f"Dimensions {original_dims[i_invalid]} are invalid "
|
||||
f"for a distribution in {n} dimensions.")
|
||||
raise ValueError(msg)
|
||||
|
||||
dataset = self.dataset[dims]
|
||||
weights = self.weights
|
||||
|
||||
return gaussian_kde(dataset, bw_method=self.covariance_factor(),
|
||||
weights=weights)
|
||||
|
||||
@property
|
||||
def weights(self):
|
||||
try:
|
||||
return self._weights
|
||||
except AttributeError:
|
||||
self._weights = ones(self.n)/self.n
|
||||
return self._weights
|
||||
|
||||
@property
|
||||
def neff(self):
|
||||
try:
|
||||
return self._neff
|
||||
except AttributeError:
|
||||
self._neff = 1/sum(self.weights**2)
|
||||
return self._neff
|
||||
|
||||
|
||||
def _get_output_dtype(covariance, points):
|
||||
"""
|
||||
Calculates the output dtype and the "spec" (=C type name).
|
||||
|
||||
This was necessary in order to deal with the fused types in the Cython
|
||||
routine `gaussian_kernel_estimate`. See gh-10824 for details.
|
||||
"""
|
||||
output_dtype = np.common_type(covariance, points)
|
||||
itemsize = np.dtype(output_dtype).itemsize
|
||||
if itemsize == 4:
|
||||
spec = 'float'
|
||||
elif itemsize == 8:
|
||||
spec = 'double'
|
||||
elif itemsize in (12, 16):
|
||||
spec = 'long double'
|
||||
else:
|
||||
raise ValueError(
|
||||
f"{output_dtype} has unexpected item size: {itemsize}"
|
||||
)
|
||||
|
||||
return output_dtype, spec
|
||||
596
.CondaPkg/env/Lib/site-packages/scipy/stats/_ksstats.py
vendored
Normal file
596
.CondaPkg/env/Lib/site-packages/scipy/stats/_ksstats.py
vendored
Normal file
@@ -0,0 +1,596 @@
|
||||
# Compute the two-sided one-sample Kolmogorov-Smirnov Prob(Dn <= d) where:
|
||||
# D_n = sup_x{|F_n(x) - F(x)|},
|
||||
# F_n(x) is the empirical CDF for a sample of size n {x_i: i=1,...,n},
|
||||
# F(x) is the CDF of a probability distribution.
|
||||
#
|
||||
# Exact methods:
|
||||
# Prob(D_n >= d) can be computed via a matrix algorithm of Durbin[1]
|
||||
# or a recursion algorithm due to Pomeranz[2].
|
||||
# Marsaglia, Tsang & Wang[3] gave a computation-efficient way to perform
|
||||
# the Durbin algorithm.
|
||||
# D_n >= d <==> D_n+ >= d or D_n- >= d (the one-sided K-S statistics), hence
|
||||
# Prob(D_n >= d) = 2*Prob(D_n+ >= d) - Prob(D_n+ >= d and D_n- >= d).
|
||||
# For d > 0.5, the latter intersection probability is 0.
|
||||
#
|
||||
# Approximate methods:
|
||||
# For d close to 0.5, ignoring that intersection term may still give a
|
||||
# reasonable approximation.
|
||||
# Li-Chien[4] and Korolyuk[5] gave an asymptotic formula extending
|
||||
# Kolmogorov's initial asymptotic, suitable for large d. (See
|
||||
# scipy.special.kolmogorov for that asymptotic)
|
||||
# Pelz-Good[6] used the functional equation for Jacobi theta functions to
|
||||
# transform the Li-Chien/Korolyuk formula produce a computational formula
|
||||
# suitable for small d.
|
||||
#
|
||||
# Simard and L'Ecuyer[7] provided an algorithm to decide when to use each of
|
||||
# the above approaches and it is that which is used here.
|
||||
#
|
||||
# Other approaches:
|
||||
# Carvalho[8] optimizes Durbin's matrix algorithm for large values of d.
|
||||
# Moscovich and Nadler[9] use FFTs to compute the convolutions.
|
||||
|
||||
# References:
|
||||
# [1] Durbin J (1968).
|
||||
# "The Probability that the Sample Distribution Function Lies Between Two
|
||||
# Parallel Straight Lines."
|
||||
# Annals of Mathematical Statistics, 39, 398-411.
|
||||
# [2] Pomeranz J (1974).
|
||||
# "Exact Cumulative Distribution of the Kolmogorov-Smirnov Statistic for
|
||||
# Small Samples (Algorithm 487)."
|
||||
# Communications of the ACM, 17(12), 703-704.
|
||||
# [3] Marsaglia G, Tsang WW, Wang J (2003).
|
||||
# "Evaluating Kolmogorov's Distribution."
|
||||
# Journal of Statistical Software, 8(18), 1-4.
|
||||
# [4] LI-CHIEN, C. (1956).
|
||||
# "On the exact distribution of the statistics of A. N. Kolmogorov and
|
||||
# their asymptotic expansion."
|
||||
# Acta Matematica Sinica, 6, 55-81.
|
||||
# [5] KOROLYUK, V. S. (1960).
|
||||
# "Asymptotic analysis of the distribution of the maximum deviation in
|
||||
# the Bernoulli scheme."
|
||||
# Theor. Probability Appl., 4, 339-366.
|
||||
# [6] Pelz W, Good IJ (1976).
|
||||
# "Approximating the Lower Tail-areas of the Kolmogorov-Smirnov One-sample
|
||||
# Statistic."
|
||||
# Journal of the Royal Statistical Society, Series B, 38(2), 152-156.
|
||||
# [7] Simard, R., L'Ecuyer, P. (2011)
|
||||
# "Computing the Two-Sided Kolmogorov-Smirnov Distribution",
|
||||
# Journal of Statistical Software, Vol 39, 11, 1-18.
|
||||
# [8] Carvalho, Luis (2015)
|
||||
# "An Improved Evaluation of Kolmogorov's Distribution"
|
||||
# Journal of Statistical Software, Code Snippets; Vol 65(3), 1-8.
|
||||
# [9] Amit Moscovich, Boaz Nadler (2017)
|
||||
# "Fast calculation of boundary crossing probabilities for Poisson
|
||||
# processes",
|
||||
# Statistics & Probability Letters, Vol 123, 177-182.
|
||||
|
||||
|
||||
import numpy as np
|
||||
import scipy.special
|
||||
import scipy.special._ufuncs as scu
|
||||
from scipy._lib._finite_differences import _derivative
|
||||
|
||||
_E128 = 128
|
||||
_EP128 = np.ldexp(np.longdouble(1), _E128)
|
||||
_EM128 = np.ldexp(np.longdouble(1), -_E128)
|
||||
|
||||
_SQRT2PI = np.sqrt(2 * np.pi)
|
||||
_LOG_2PI = np.log(2 * np.pi)
|
||||
_MIN_LOG = -708
|
||||
_SQRT3 = np.sqrt(3)
|
||||
_PI_SQUARED = np.pi ** 2
|
||||
_PI_FOUR = np.pi ** 4
|
||||
_PI_SIX = np.pi ** 6
|
||||
|
||||
# [Lifted from _loggamma.pxd.] If B_m are the Bernoulli numbers,
|
||||
# then Stirling coeffs are B_{2j}/(2j)/(2j-1) for j=8,...1.
|
||||
_STIRLING_COEFFS = [-2.955065359477124183e-2, 6.4102564102564102564e-3,
|
||||
-1.9175269175269175269e-3, 8.4175084175084175084e-4,
|
||||
-5.952380952380952381e-4, 7.9365079365079365079e-4,
|
||||
-2.7777777777777777778e-3, 8.3333333333333333333e-2]
|
||||
|
||||
def _log_nfactorial_div_n_pow_n(n):
|
||||
# Computes n! / n**n
|
||||
# = (n-1)! / n**(n-1)
|
||||
# Uses Stirling's approximation, but removes n*log(n) up-front to
|
||||
# avoid subtractive cancellation.
|
||||
# = log(n)/2 - n + log(sqrt(2pi)) + sum B_{2j}/(2j)/(2j-1)/n**(2j-1)
|
||||
rn = 1.0/n
|
||||
return np.log(n)/2 - n + _LOG_2PI/2 + rn * np.polyval(_STIRLING_COEFFS, rn/n)
|
||||
|
||||
|
||||
def _clip_prob(p):
|
||||
"""clips a probability to range 0<=p<=1."""
|
||||
return np.clip(p, 0.0, 1.0)
|
||||
|
||||
|
||||
def _select_and_clip_prob(cdfprob, sfprob, cdf=True):
|
||||
"""Selects either the CDF or SF, and then clips to range 0<=p<=1."""
|
||||
p = np.where(cdf, cdfprob, sfprob)
|
||||
return _clip_prob(p)
|
||||
|
||||
|
||||
def _kolmogn_DMTW(n, d, cdf=True):
|
||||
r"""Computes the Kolmogorov CDF: Pr(D_n <= d) using the MTW approach to
|
||||
the Durbin matrix algorithm.
|
||||
|
||||
Durbin (1968); Marsaglia, Tsang, Wang (2003). [1], [3].
|
||||
"""
|
||||
# Write d = (k-h)/n, where k is positive integer and 0 <= h < 1
|
||||
# Generate initial matrix H of size m*m where m=(2k-1)
|
||||
# Compute k-th row of (n!/n^n) * H^n, scaling intermediate results.
|
||||
# Requires memory O(m^2) and computation O(m^2 log(n)).
|
||||
# Most suitable for small m.
|
||||
|
||||
if d >= 1.0:
|
||||
return _select_and_clip_prob(1.0, 0.0, cdf)
|
||||
nd = n * d
|
||||
if nd <= 0.5:
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf)
|
||||
k = int(np.ceil(nd))
|
||||
h = k - nd
|
||||
m = 2 * k - 1
|
||||
|
||||
H = np.zeros([m, m])
|
||||
|
||||
# Initialize: v is first column (and last row) of H
|
||||
# v[j] = (1-h^(j+1)/(j+1)! (except for v[-1])
|
||||
# w[j] = 1/(j)!
|
||||
# q = k-th row of H (actually i!/n^i*H^i)
|
||||
intm = np.arange(1, m + 1)
|
||||
v = 1.0 - h ** intm
|
||||
w = np.empty(m)
|
||||
fac = 1.0
|
||||
for j in intm:
|
||||
w[j - 1] = fac
|
||||
fac /= j # This might underflow. Isn't a problem.
|
||||
v[j - 1] *= fac
|
||||
tt = max(2 * h - 1.0, 0)**m - 2*h**m
|
||||
v[-1] = (1.0 + tt) * fac
|
||||
|
||||
for i in range(1, m):
|
||||
H[i - 1:, i] = w[:m - i + 1]
|
||||
H[:, 0] = v
|
||||
H[-1, :] = np.flip(v, axis=0)
|
||||
|
||||
Hpwr = np.eye(np.shape(H)[0]) # Holds intermediate powers of H
|
||||
nn = n
|
||||
expnt = 0 # Scaling of Hpwr
|
||||
Hexpnt = 0 # Scaling of H
|
||||
while nn > 0:
|
||||
if nn % 2:
|
||||
Hpwr = np.matmul(Hpwr, H)
|
||||
expnt += Hexpnt
|
||||
H = np.matmul(H, H)
|
||||
Hexpnt *= 2
|
||||
# Scale as needed.
|
||||
if np.abs(H[k - 1, k - 1]) > _EP128:
|
||||
H /= _EP128
|
||||
Hexpnt += _E128
|
||||
nn = nn // 2
|
||||
|
||||
p = Hpwr[k - 1, k - 1]
|
||||
|
||||
# Multiply by n!/n^n
|
||||
for i in range(1, n + 1):
|
||||
p = i * p / n
|
||||
if np.abs(p) < _EM128:
|
||||
p *= _EP128
|
||||
expnt -= _E128
|
||||
|
||||
# unscale
|
||||
if expnt != 0:
|
||||
p = np.ldexp(p, expnt)
|
||||
|
||||
return _select_and_clip_prob(p, 1.0-p, cdf)
|
||||
|
||||
|
||||
def _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf):
|
||||
"""Compute the endpoints of the interval for row i."""
|
||||
if i == 0:
|
||||
j1, j2 = -ll - ceilf - 1, ll + ceilf - 1
|
||||
else:
|
||||
# i + 1 = 2*ip1div2 + ip1mod2
|
||||
ip1div2, ip1mod2 = divmod(i + 1, 2)
|
||||
if ip1mod2 == 0: # i is odd
|
||||
if ip1div2 == n + 1:
|
||||
j1, j2 = n - ll - ceilf - 1, n + ll + ceilf - 1
|
||||
else:
|
||||
j1, j2 = ip1div2 - 1 - ll - roundf - 1, ip1div2 + ll - 1 + ceilf - 1
|
||||
else:
|
||||
j1, j2 = ip1div2 - 1 - ll - 1, ip1div2 + ll + roundf - 1
|
||||
|
||||
return max(j1 + 2, 0), min(j2, n)
|
||||
|
||||
|
||||
def _kolmogn_Pomeranz(n, x, cdf=True):
|
||||
r"""Computes Pr(D_n <= d) using the Pomeranz recursion algorithm.
|
||||
|
||||
Pomeranz (1974) [2]
|
||||
"""
|
||||
|
||||
# V is n*(2n+2) matrix.
|
||||
# Each row is convolution of the previous row and probabilities from a
|
||||
# Poisson distribution.
|
||||
# Desired CDF probability is n! V[n-1, 2n+1] (final entry in final row).
|
||||
# Only two rows are needed at any given stage:
|
||||
# - Call them V0 and V1.
|
||||
# - Swap each iteration
|
||||
# Only a few (contiguous) entries in each row can be non-zero.
|
||||
# - Keep track of start and end (j1 and j2 below)
|
||||
# - V0s and V1s track the start in the two rows
|
||||
# Scale intermediate results as needed.
|
||||
# Only a few different Poisson distributions can occur
|
||||
t = n * x
|
||||
ll = int(np.floor(t))
|
||||
f = 1.0 * (t - ll) # fractional part of t
|
||||
g = min(f, 1.0 - f)
|
||||
ceilf = (1 if f > 0 else 0)
|
||||
roundf = (1 if f > 0.5 else 0)
|
||||
npwrs = 2 * (ll + 1) # Maximum number of powers needed in convolutions
|
||||
gpower = np.empty(npwrs) # gpower = (g/n)^m/m!
|
||||
twogpower = np.empty(npwrs) # twogpower = (2g/n)^m/m!
|
||||
onem2gpower = np.empty(npwrs) # onem2gpower = ((1-2g)/n)^m/m!
|
||||
# gpower etc are *almost* Poisson probs, just missing normalizing factor.
|
||||
|
||||
gpower[0] = 1.0
|
||||
twogpower[0] = 1.0
|
||||
onem2gpower[0] = 1.0
|
||||
expnt = 0
|
||||
g_over_n, two_g_over_n, one_minus_two_g_over_n = g/n, 2*g/n, (1 - 2*g)/n
|
||||
for m in range(1, npwrs):
|
||||
gpower[m] = gpower[m - 1] * g_over_n / m
|
||||
twogpower[m] = twogpower[m - 1] * two_g_over_n / m
|
||||
onem2gpower[m] = onem2gpower[m - 1] * one_minus_two_g_over_n / m
|
||||
|
||||
V0 = np.zeros([npwrs])
|
||||
V1 = np.zeros([npwrs])
|
||||
V1[0] = 1 # first row
|
||||
V0s, V1s = 0, 0 # start indices of the two rows
|
||||
|
||||
j1, j2 = _pomeranz_compute_j1j2(0, n, ll, ceilf, roundf)
|
||||
for i in range(1, 2 * n + 2):
|
||||
# Preserve j1, V1, V1s, V0s from last iteration
|
||||
k1 = j1
|
||||
V0, V1 = V1, V0
|
||||
V0s, V1s = V1s, V0s
|
||||
V1.fill(0.0)
|
||||
j1, j2 = _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf)
|
||||
if i == 1 or i == 2 * n + 1:
|
||||
pwrs = gpower
|
||||
else:
|
||||
pwrs = (twogpower if i % 2 else onem2gpower)
|
||||
ln2 = j2 - k1 + 1
|
||||
if ln2 > 0:
|
||||
conv = np.convolve(V0[k1 - V0s:k1 - V0s + ln2], pwrs[:ln2])
|
||||
conv_start = j1 - k1 # First index to use from conv
|
||||
conv_len = j2 - j1 + 1 # Number of entries to use from conv
|
||||
V1[:conv_len] = conv[conv_start:conv_start + conv_len]
|
||||
# Scale to avoid underflow.
|
||||
if 0 < np.max(V1) < _EM128:
|
||||
V1 *= _EP128
|
||||
expnt -= _E128
|
||||
V1s = V0s + j1 - k1
|
||||
|
||||
# multiply by n!
|
||||
ans = V1[n - V1s]
|
||||
for m in range(1, n + 1):
|
||||
if np.abs(ans) > _EP128:
|
||||
ans *= _EM128
|
||||
expnt += _E128
|
||||
ans *= m
|
||||
|
||||
# Undo any intermediate scaling
|
||||
if expnt != 0:
|
||||
ans = np.ldexp(ans, expnt)
|
||||
ans = _select_and_clip_prob(ans, 1.0 - ans, cdf)
|
||||
return ans
|
||||
|
||||
|
||||
def _kolmogn_PelzGood(n, x, cdf=True):
|
||||
"""Computes the Pelz-Good approximation to Prob(Dn <= x) with 0<=x<=1.
|
||||
|
||||
Start with Li-Chien, Korolyuk approximation:
|
||||
Prob(Dn <= x) ~ K0(z) + K1(z)/sqrt(n) + K2(z)/n + K3(z)/n**1.5
|
||||
where z = x*sqrt(n).
|
||||
Transform each K_(z) using Jacobi theta functions into a form suitable
|
||||
for small z.
|
||||
Pelz-Good (1976). [6]
|
||||
"""
|
||||
if x <= 0.0:
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
|
||||
if x >= 1.0:
|
||||
return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
|
||||
|
||||
z = np.sqrt(n) * x
|
||||
zsquared, zthree, zfour, zsix = z**2, z**3, z**4, z**6
|
||||
|
||||
qlog = -_PI_SQUARED / 8 / zsquared
|
||||
if qlog < _MIN_LOG: # z ~ 0.041743441416853426
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
|
||||
|
||||
q = np.exp(qlog)
|
||||
|
||||
# Coefficients of terms in the sums for K1, K2 and K3
|
||||
k1a = -zsquared
|
||||
k1b = _PI_SQUARED / 4
|
||||
|
||||
k2a = 6 * zsix + 2 * zfour
|
||||
k2b = (2 * zfour - 5 * zsquared) * _PI_SQUARED / 4
|
||||
k2c = _PI_FOUR * (1 - 2 * zsquared) / 16
|
||||
|
||||
k3d = _PI_SIX * (5 - 30 * zsquared) / 64
|
||||
k3c = _PI_FOUR * (-60 * zsquared + 212 * zfour) / 16
|
||||
k3b = _PI_SQUARED * (135 * zfour - 96 * zsix) / 4
|
||||
k3a = -30 * zsix - 90 * z**8
|
||||
|
||||
K0to3 = np.zeros(4)
|
||||
# Use a Horner scheme to evaluate sum c_i q^(i^2)
|
||||
# Reduces to a sum over odd integers.
|
||||
maxk = int(np.ceil(16 * z / np.pi))
|
||||
for k in range(maxk, 0, -1):
|
||||
m = 2 * k - 1
|
||||
msquared, mfour, msix = m**2, m**4, m**6
|
||||
qpower = np.power(q, 8 * k)
|
||||
coeffs = np.array([1.0,
|
||||
k1a + k1b*msquared,
|
||||
k2a + k2b*msquared + k2c*mfour,
|
||||
k3a + k3b*msquared + k3c*mfour + k3d*msix])
|
||||
K0to3 *= qpower
|
||||
K0to3 += coeffs
|
||||
K0to3 *= q
|
||||
K0to3 *= _SQRT2PI
|
||||
# z**10 > 0 as z > 0.04
|
||||
K0to3 /= np.array([z, 6 * zfour, 72 * z**7, 6480 * z**10])
|
||||
|
||||
# Now do the other sum over the other terms, all integers k
|
||||
# K_2: (pi^2 k^2) q^(k^2),
|
||||
# K_3: (3pi^2 k^2 z^2 - pi^4 k^4)*q^(k^2)
|
||||
# Don't expect much subtractive cancellation so use direct calculation
|
||||
q = np.exp(-_PI_SQUARED / 2 / zsquared)
|
||||
ks = np.arange(maxk, 0, -1)
|
||||
ksquared = ks ** 2
|
||||
sqrt3z = _SQRT3 * z
|
||||
kspi = np.pi * ks
|
||||
qpwers = q ** ksquared
|
||||
k2extra = np.sum(ksquared * qpwers)
|
||||
k2extra *= _PI_SQUARED * _SQRT2PI/(-36 * zthree)
|
||||
K0to3[2] += k2extra
|
||||
k3extra = np.sum((sqrt3z + kspi) * (sqrt3z - kspi) * ksquared * qpwers)
|
||||
k3extra *= _PI_SQUARED * _SQRT2PI/(216 * zsix)
|
||||
K0to3[3] += k3extra
|
||||
powers_of_n = np.power(n * 1.0, np.arange(len(K0to3)) / 2.0)
|
||||
K0to3 /= powers_of_n
|
||||
|
||||
if not cdf:
|
||||
K0to3 *= -1
|
||||
K0to3[0] += 1
|
||||
|
||||
Ksum = sum(K0to3)
|
||||
return Ksum
|
||||
|
||||
|
||||
def _kolmogn(n, x, cdf=True):
|
||||
"""Computes the CDF(or SF) for the two-sided Kolmogorov-Smirnov statistic.
|
||||
|
||||
x must be of type float, n of type integer.
|
||||
|
||||
Simard & L'Ecuyer (2011) [7].
|
||||
"""
|
||||
if np.isnan(n):
|
||||
return n # Keep the same type of nan
|
||||
if int(n) != n or n <= 0:
|
||||
return np.nan
|
||||
if x >= 1.0:
|
||||
return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
|
||||
if x <= 0.0:
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
|
||||
t = n * x
|
||||
if t <= 1.0: # Ruben-Gambino: 1/2n <= x <= 1/n
|
||||
if t <= 0.5:
|
||||
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
|
||||
if n <= 140:
|
||||
prob = np.prod(np.arange(1, n+1) * (1.0/n) * (2*t - 1))
|
||||
else:
|
||||
prob = np.exp(_log_nfactorial_div_n_pow_n(n) + n * np.log(2*t-1))
|
||||
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
|
||||
if t >= n - 1: # Ruben-Gambino
|
||||
prob = 2 * (1.0 - x)**n
|
||||
return _select_and_clip_prob(1 - prob, prob, cdf=cdf)
|
||||
if x >= 0.5: # Exact: 2 * smirnov
|
||||
prob = 2 * scipy.special.smirnov(n, x)
|
||||
return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
|
||||
|
||||
nxsquared = t * x
|
||||
if n <= 140:
|
||||
if nxsquared <= 0.754693:
|
||||
prob = _kolmogn_DMTW(n, x, cdf=True)
|
||||
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
|
||||
if nxsquared <= 4:
|
||||
prob = _kolmogn_Pomeranz(n, x, cdf=True)
|
||||
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
|
||||
# Now use Miller approximation of 2*smirnov
|
||||
prob = 2 * scipy.special.smirnov(n, x)
|
||||
return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
|
||||
|
||||
# Split CDF and SF as they have different cutoffs on nxsquared.
|
||||
if not cdf:
|
||||
if nxsquared >= 370.0:
|
||||
return 0.0
|
||||
if nxsquared >= 2.2:
|
||||
prob = 2 * scipy.special.smirnov(n, x)
|
||||
return _clip_prob(prob)
|
||||
# Fall through and compute the SF as 1.0-CDF
|
||||
if nxsquared >= 18.0:
|
||||
cdfprob = 1.0
|
||||
elif n <= 100000 and n * x**1.5 <= 1.4:
|
||||
cdfprob = _kolmogn_DMTW(n, x, cdf=True)
|
||||
else:
|
||||
cdfprob = _kolmogn_PelzGood(n, x, cdf=True)
|
||||
return _select_and_clip_prob(cdfprob, 1.0 - cdfprob, cdf=cdf)
|
||||
|
||||
|
||||
def _kolmogn_p(n, x):
|
||||
"""Computes the PDF for the two-sided Kolmogorov-Smirnov statistic.
|
||||
|
||||
x must be of type float, n of type integer.
|
||||
"""
|
||||
if np.isnan(n):
|
||||
return n # Keep the same type of nan
|
||||
if int(n) != n or n <= 0:
|
||||
return np.nan
|
||||
if x >= 1.0 or x <= 0:
|
||||
return 0
|
||||
t = n * x
|
||||
if t <= 1.0:
|
||||
# Ruben-Gambino: n!/n^n * (2t-1)^n -> 2 n!/n^n * n^2 * (2t-1)^(n-1)
|
||||
if t <= 0.5:
|
||||
return 0.0
|
||||
if n <= 140:
|
||||
prd = np.prod(np.arange(1, n) * (1.0 / n) * (2 * t - 1))
|
||||
else:
|
||||
prd = np.exp(_log_nfactorial_div_n_pow_n(n) + (n-1) * np.log(2 * t - 1))
|
||||
return prd * 2 * n**2
|
||||
if t >= n - 1:
|
||||
# Ruben-Gambino : 1-2(1-x)**n -> 2n*(1-x)**(n-1)
|
||||
return 2 * (1.0 - x) ** (n-1) * n
|
||||
if x >= 0.5:
|
||||
return 2 * scipy.stats.ksone.pdf(x, n)
|
||||
|
||||
# Just take a small delta.
|
||||
# Ideally x +/- delta would stay within [i/n, (i+1)/n] for some integer a.
|
||||
# as the CDF is a piecewise degree n polynomial.
|
||||
# It has knots at 1/n, 2/n, ... (n-1)/n
|
||||
# and is not a C-infinity function at the knots
|
||||
delta = x / 2.0**16
|
||||
delta = min(delta, x - 1.0/n)
|
||||
delta = min(delta, 0.5 - x)
|
||||
|
||||
def _kk(_x):
|
||||
return kolmogn(n, _x)
|
||||
|
||||
return _derivative(_kk, x, dx=delta, order=5)
|
||||
|
||||
|
||||
def _kolmogni(n, p, q):
|
||||
"""Computes the PPF/ISF of kolmogn.
|
||||
|
||||
n of type integer, n>= 1
|
||||
p is the CDF, q the SF, p+q=1
|
||||
"""
|
||||
if np.isnan(n):
|
||||
return n # Keep the same type of nan
|
||||
if int(n) != n or n <= 0:
|
||||
return np.nan
|
||||
if p <= 0:
|
||||
return 1.0/n
|
||||
if q <= 0:
|
||||
return 1.0
|
||||
delta = np.exp((np.log(p) - scipy.special.loggamma(n+1))/n)
|
||||
if delta <= 1.0/n:
|
||||
return (delta + 1.0 / n) / 2
|
||||
x = -np.expm1(np.log(q/2.0)/n)
|
||||
if x >= 1 - 1.0/n:
|
||||
return x
|
||||
x1 = scu._kolmogci(p)/np.sqrt(n)
|
||||
x1 = min(x1, 1.0 - 1.0/n)
|
||||
_f = lambda x: _kolmogn(n, x) - p
|
||||
return scipy.optimize.brentq(_f, 1.0/n, x1, xtol=1e-14)
|
||||
|
||||
|
||||
def kolmogn(n, x, cdf=True):
|
||||
"""Computes the CDF for the two-sided Kolmogorov-Smirnov distribution.
|
||||
|
||||
The two-sided Kolmogorov-Smirnov distribution has as its CDF Pr(D_n <= x),
|
||||
for a sample of size n drawn from a distribution with CDF F(t), where
|
||||
D_n &= sup_t |F_n(t) - F(t)|, and
|
||||
F_n(t) is the Empirical Cumulative Distribution Function of the sample.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n : integer, array_like
|
||||
the number of samples
|
||||
x : float, array_like
|
||||
The K-S statistic, float between 0 and 1
|
||||
cdf : bool, optional
|
||||
whether to compute the CDF(default=true) or the SF.
|
||||
|
||||
Returns
|
||||
-------
|
||||
cdf : ndarray
|
||||
CDF (or SF it cdf is False) at the specified locations.
|
||||
|
||||
The return value has shape the result of numpy broadcasting n and x.
|
||||
"""
|
||||
it = np.nditer([n, x, cdf, None],
|
||||
op_dtypes=[None, np.float64, np.bool_, np.float64])
|
||||
for _n, _x, _cdf, z in it:
|
||||
if np.isnan(_n):
|
||||
z[...] = _n
|
||||
continue
|
||||
if int(_n) != _n:
|
||||
raise ValueError(f'n is not integral: {_n}')
|
||||
z[...] = _kolmogn(int(_n), _x, cdf=_cdf)
|
||||
result = it.operands[-1]
|
||||
return result
|
||||
|
||||
|
||||
def kolmognp(n, x):
|
||||
"""Computes the PDF for the two-sided Kolmogorov-Smirnov distribution.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n : integer, array_like
|
||||
the number of samples
|
||||
x : float, array_like
|
||||
The K-S statistic, float between 0 and 1
|
||||
|
||||
Returns
|
||||
-------
|
||||
pdf : ndarray
|
||||
The PDF at the specified locations
|
||||
|
||||
The return value has shape the result of numpy broadcasting n and x.
|
||||
"""
|
||||
it = np.nditer([n, x, None])
|
||||
for _n, _x, z in it:
|
||||
if np.isnan(_n):
|
||||
z[...] = _n
|
||||
continue
|
||||
if int(_n) != _n:
|
||||
raise ValueError(f'n is not integral: {_n}')
|
||||
z[...] = _kolmogn_p(int(_n), _x)
|
||||
result = it.operands[-1]
|
||||
return result
|
||||
|
||||
|
||||
def kolmogni(n, q, cdf=True):
|
||||
"""Computes the PPF(or ISF) for the two-sided Kolmogorov-Smirnov distribution.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n : integer, array_like
|
||||
the number of samples
|
||||
q : float, array_like
|
||||
Probabilities, float between 0 and 1
|
||||
cdf : bool, optional
|
||||
whether to compute the PPF(default=true) or the ISF.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ppf : ndarray
|
||||
PPF (or ISF if cdf is False) at the specified locations
|
||||
|
||||
The return value has shape the result of numpy broadcasting n and x.
|
||||
"""
|
||||
it = np.nditer([n, q, cdf, None])
|
||||
for _n, _q, _cdf, z in it:
|
||||
if np.isnan(_n):
|
||||
z[...] = _n
|
||||
continue
|
||||
if int(_n) != _n:
|
||||
raise ValueError(f'n is not integral: {_n}')
|
||||
_pcdf, _psf = (_q, 1-_q) if _cdf else (1-_q, _q)
|
||||
z[...] = _kolmogni(int(_n), _pcdf, _psf)
|
||||
result = it.operands[-1]
|
||||
return result
|
||||
1200
.CondaPkg/env/Lib/site-packages/scipy/stats/_levy_stable/__init__.py
vendored
Normal file
1200
.CondaPkg/env/Lib/site-packages/scipy/stats/_levy_stable/__init__.py
vendored
Normal file
File diff suppressed because it is too large
Load Diff
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_levy_stable/__pycache__/__init__.cpython-311.pyc
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_levy_stable/__pycache__/__init__.cpython-311.pyc
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_levy_stable/levyst.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_levy_stable/levyst.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_levy_stable/levyst.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_levy_stable/levyst.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
493
.CondaPkg/env/Lib/site-packages/scipy/stats/_mannwhitneyu.py
vendored
Normal file
493
.CondaPkg/env/Lib/site-packages/scipy/stats/_mannwhitneyu.py
vendored
Normal file
@@ -0,0 +1,493 @@
|
||||
import numpy as np
|
||||
from collections import namedtuple
|
||||
from scipy import special
|
||||
from scipy import stats
|
||||
from ._axis_nan_policy import _axis_nan_policy_factory
|
||||
|
||||
|
||||
def _broadcast_concatenate(x, y, axis):
|
||||
'''Broadcast then concatenate arrays, leaving concatenation axis last'''
|
||||
x = np.moveaxis(x, axis, -1)
|
||||
y = np.moveaxis(y, axis, -1)
|
||||
z = np.broadcast(x[..., 0], y[..., 0])
|
||||
x = np.broadcast_to(x, z.shape + (x.shape[-1],))
|
||||
y = np.broadcast_to(y, z.shape + (y.shape[-1],))
|
||||
z = np.concatenate((x, y), axis=-1)
|
||||
return x, y, z
|
||||
|
||||
|
||||
class _MWU:
|
||||
'''Distribution of MWU statistic under the null hypothesis'''
|
||||
# Possible improvement: if m and n are small enough, use integer arithmetic
|
||||
|
||||
def __init__(self):
|
||||
'''Minimal initializer'''
|
||||
self._fmnks = -np.ones((1, 1, 1))
|
||||
self._recursive = None
|
||||
|
||||
def pmf(self, k, m, n):
|
||||
if (self._recursive is None and m <= 500 and n <= 500
|
||||
or self._recursive):
|
||||
return self.pmf_recursive(k, m, n)
|
||||
else:
|
||||
return self.pmf_iterative(k, m, n)
|
||||
|
||||
def pmf_recursive(self, k, m, n):
|
||||
'''Probability mass function, recursive version'''
|
||||
self._resize_fmnks(m, n, np.max(k))
|
||||
# could loop over just the unique elements, but probably not worth
|
||||
# the time to find them
|
||||
for i in np.ravel(k):
|
||||
self._f(m, n, i)
|
||||
return self._fmnks[m, n, k] / special.binom(m + n, m)
|
||||
|
||||
def pmf_iterative(self, k, m, n):
|
||||
'''Probability mass function, iterative version'''
|
||||
fmnks = {}
|
||||
for i in np.ravel(k):
|
||||
fmnks = _mwu_f_iterative(m, n, i, fmnks)
|
||||
return (np.array([fmnks[(m, n, ki)] for ki in k])
|
||||
/ special.binom(m + n, m))
|
||||
|
||||
def cdf(self, k, m, n):
|
||||
'''Cumulative distribution function'''
|
||||
# We could use the fact that the distribution is symmetric to avoid
|
||||
# summing more than m*n/2 terms, but it might not be worth the
|
||||
# overhead. Let's leave that to an improvement.
|
||||
pmfs = self.pmf(np.arange(0, np.max(k) + 1), m, n)
|
||||
cdfs = np.cumsum(pmfs)
|
||||
return cdfs[k]
|
||||
|
||||
def sf(self, k, m, n):
|
||||
'''Survival function'''
|
||||
# Use the fact that the distribution is symmetric; i.e.
|
||||
# _f(m, n, m*n-k) = _f(m, n, k), and sum from the left
|
||||
k = m*n - k
|
||||
# Note that both CDF and SF include the PMF at k. The p-value is
|
||||
# calculated from the SF and should include the mass at k, so this
|
||||
# is desirable
|
||||
return self.cdf(k, m, n)
|
||||
|
||||
def _resize_fmnks(self, m, n, k):
|
||||
'''If necessary, expand the array that remembers PMF values'''
|
||||
# could probably use `np.pad` but I'm not sure it would save code
|
||||
shape_old = np.array(self._fmnks.shape)
|
||||
shape_new = np.array((m+1, n+1, k+1))
|
||||
if np.any(shape_new > shape_old):
|
||||
shape = np.maximum(shape_old, shape_new)
|
||||
fmnks = -np.ones(shape) # create the new array
|
||||
m0, n0, k0 = shape_old
|
||||
fmnks[:m0, :n0, :k0] = self._fmnks # copy remembered values
|
||||
self._fmnks = fmnks
|
||||
|
||||
def _f(self, m, n, k):
|
||||
'''Recursive implementation of function of [3] Theorem 2.5'''
|
||||
|
||||
# [3] Theorem 2.5 Line 1
|
||||
if k < 0 or m < 0 or n < 0 or k > m*n:
|
||||
return 0
|
||||
|
||||
# if already calculated, return the value
|
||||
if self._fmnks[m, n, k] >= 0:
|
||||
return self._fmnks[m, n, k]
|
||||
|
||||
if k == 0 and m >= 0 and n >= 0: # [3] Theorem 2.5 Line 2
|
||||
fmnk = 1
|
||||
else: # [3] Theorem 2.5 Line 3 / Equation 3
|
||||
fmnk = self._f(m-1, n, k-n) + self._f(m, n-1, k)
|
||||
|
||||
self._fmnks[m, n, k] = fmnk # remember result
|
||||
|
||||
return fmnk
|
||||
|
||||
|
||||
# Maintain state for faster repeat calls to mannwhitneyu w/ method='exact'
|
||||
_mwu_state = _MWU()
|
||||
|
||||
|
||||
def _mwu_f_iterative(m, n, k, fmnks):
|
||||
'''Iterative implementation of function of [3] Theorem 2.5'''
|
||||
|
||||
def _base_case(m, n, k):
|
||||
'''Base cases from recursive version'''
|
||||
|
||||
# if already calculated, return the value
|
||||
if fmnks.get((m, n, k), -1) >= 0:
|
||||
return fmnks[(m, n, k)]
|
||||
|
||||
# [3] Theorem 2.5 Line 1
|
||||
elif k < 0 or m < 0 or n < 0 or k > m*n:
|
||||
return 0
|
||||
|
||||
# [3] Theorem 2.5 Line 2
|
||||
elif k == 0 and m >= 0 and n >= 0:
|
||||
return 1
|
||||
|
||||
return None
|
||||
|
||||
stack = [(m, n, k)]
|
||||
fmnk = None
|
||||
|
||||
while stack:
|
||||
# Popping only if necessary would save a tiny bit of time, but NWI.
|
||||
m, n, k = stack.pop()
|
||||
|
||||
# If we're at a base case, continue (stack unwinds)
|
||||
fmnk = _base_case(m, n, k)
|
||||
if fmnk is not None:
|
||||
fmnks[(m, n, k)] = fmnk
|
||||
continue
|
||||
|
||||
# If both terms are base cases, continue (stack unwinds)
|
||||
f1 = _base_case(m-1, n, k-n)
|
||||
f2 = _base_case(m, n-1, k)
|
||||
if f1 is not None and f2 is not None:
|
||||
# [3] Theorem 2.5 Line 3 / Equation 3
|
||||
fmnk = f1 + f2
|
||||
fmnks[(m, n, k)] = fmnk
|
||||
continue
|
||||
|
||||
# recurse deeper
|
||||
stack.append((m, n, k))
|
||||
if f1 is None:
|
||||
stack.append((m-1, n, k-n))
|
||||
if f2 is None:
|
||||
stack.append((m, n-1, k))
|
||||
|
||||
return fmnks
|
||||
|
||||
|
||||
def _tie_term(ranks):
|
||||
"""Tie correction term"""
|
||||
# element i of t is the number of elements sharing rank i
|
||||
_, t = np.unique(ranks, return_counts=True, axis=-1)
|
||||
return (t**3 - t).sum(axis=-1)
|
||||
|
||||
|
||||
def _get_mwu_z(U, n1, n2, ranks, axis=0, continuity=True):
|
||||
'''Standardized MWU statistic'''
|
||||
# Follows mannwhitneyu [2]
|
||||
mu = n1 * n2 / 2
|
||||
n = n1 + n2
|
||||
|
||||
# Tie correction according to [2]
|
||||
tie_term = np.apply_along_axis(_tie_term, -1, ranks)
|
||||
s = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n-1))))
|
||||
|
||||
# equivalent to using scipy.stats.tiecorrect
|
||||
# T = np.apply_along_axis(stats.tiecorrect, -1, ranks)
|
||||
# s = np.sqrt(T * n1 * n2 * (n1+n2+1) / 12.0)
|
||||
|
||||
numerator = U - mu
|
||||
|
||||
# Continuity correction.
|
||||
# Because SF is always used to calculate the p-value, we can always
|
||||
# _subtract_ 0.5 for the continuity correction. This always increases the
|
||||
# p-value to account for the rest of the probability mass _at_ q = U.
|
||||
if continuity:
|
||||
numerator -= 0.5
|
||||
|
||||
# no problem evaluating the norm SF at an infinity
|
||||
with np.errstate(divide='ignore', invalid='ignore'):
|
||||
z = numerator / s
|
||||
return z
|
||||
|
||||
|
||||
def _mwu_input_validation(x, y, use_continuity, alternative, axis, method):
|
||||
''' Input validation and standardization for mannwhitneyu '''
|
||||
# Would use np.asarray_chkfinite, but infs are OK
|
||||
x, y = np.atleast_1d(x), np.atleast_1d(y)
|
||||
if np.isnan(x).any() or np.isnan(y).any():
|
||||
raise ValueError('`x` and `y` must not contain NaNs.')
|
||||
if np.size(x) == 0 or np.size(y) == 0:
|
||||
raise ValueError('`x` and `y` must be of nonzero size.')
|
||||
|
||||
bools = {True, False}
|
||||
if use_continuity not in bools:
|
||||
raise ValueError(f'`use_continuity` must be one of {bools}.')
|
||||
|
||||
alternatives = {"two-sided", "less", "greater"}
|
||||
alternative = alternative.lower()
|
||||
if alternative not in alternatives:
|
||||
raise ValueError(f'`alternative` must be one of {alternatives}.')
|
||||
|
||||
axis_int = int(axis)
|
||||
if axis != axis_int:
|
||||
raise ValueError('`axis` must be an integer.')
|
||||
|
||||
methods = {"asymptotic", "exact", "auto"}
|
||||
method = method.lower()
|
||||
if method not in methods:
|
||||
raise ValueError(f'`method` must be one of {methods}.')
|
||||
|
||||
return x, y, use_continuity, alternative, axis_int, method
|
||||
|
||||
|
||||
def _tie_check(xy):
|
||||
"""Find any ties in data"""
|
||||
_, t = np.unique(xy, return_counts=True, axis=-1)
|
||||
return np.any(t != 1)
|
||||
|
||||
|
||||
def _mwu_choose_method(n1, n2, xy, method):
|
||||
"""Choose method 'asymptotic' or 'exact' depending on input size, ties"""
|
||||
|
||||
# if both inputs are large, asymptotic is OK
|
||||
if n1 > 8 and n2 > 8:
|
||||
return "asymptotic"
|
||||
|
||||
# if there are any ties, asymptotic is preferred
|
||||
if np.apply_along_axis(_tie_check, -1, xy).any():
|
||||
return "asymptotic"
|
||||
|
||||
return "exact"
|
||||
|
||||
|
||||
MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))
|
||||
|
||||
|
||||
@_axis_nan_policy_factory(MannwhitneyuResult, n_samples=2)
|
||||
def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",
|
||||
axis=0, method="auto"):
|
||||
r'''Perform the Mann-Whitney U rank test on two independent samples.
|
||||
|
||||
The Mann-Whitney U test is a nonparametric test of the null hypothesis
|
||||
that the distribution underlying sample `x` is the same as the
|
||||
distribution underlying sample `y`. It is often used as a test of
|
||||
difference in location between distributions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x, y : array-like
|
||||
N-d arrays of samples. The arrays must be broadcastable except along
|
||||
the dimension given by `axis`.
|
||||
use_continuity : bool, optional
|
||||
Whether a continuity correction (1/2) should be applied.
|
||||
Default is True when `method` is ``'asymptotic'``; has no effect
|
||||
otherwise.
|
||||
alternative : {'two-sided', 'less', 'greater'}, optional
|
||||
Defines the alternative hypothesis. Default is 'two-sided'.
|
||||
Let *F(u)* and *G(u)* be the cumulative distribution functions of the
|
||||
distributions underlying `x` and `y`, respectively. Then the following
|
||||
alternative hypotheses are available:
|
||||
|
||||
* 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
|
||||
at least one *u*.
|
||||
* 'less': the distribution underlying `x` is stochastically less
|
||||
than the distribution underlying `y`, i.e. *F(u) > G(u)* for all *u*.
|
||||
* 'greater': the distribution underlying `x` is stochastically greater
|
||||
than the distribution underlying `y`, i.e. *F(u) < G(u)* for all *u*.
|
||||
|
||||
Under a more restrictive set of assumptions, the alternative hypotheses
|
||||
can be expressed in terms of the locations of the distributions;
|
||||
see [5] section 5.1.
|
||||
axis : int, optional
|
||||
Axis along which to perform the test. Default is 0.
|
||||
method : {'auto', 'asymptotic', 'exact'}, optional
|
||||
Selects the method used to calculate the *p*-value.
|
||||
Default is 'auto'. The following options are available.
|
||||
|
||||
* ``'asymptotic'``: compares the standardized test statistic
|
||||
against the normal distribution, correcting for ties.
|
||||
* ``'exact'``: computes the exact *p*-value by comparing the observed
|
||||
:math:`U` statistic against the exact distribution of the :math:`U`
|
||||
statistic under the null hypothesis. No correction is made for ties.
|
||||
* ``'auto'``: chooses ``'exact'`` when the size of one of the samples
|
||||
is less than 8 and there are no ties; chooses ``'asymptotic'``
|
||||
otherwise.
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : MannwhitneyuResult
|
||||
An object containing attributes:
|
||||
|
||||
statistic : float
|
||||
The Mann-Whitney U statistic corresponding with sample `x`. See
|
||||
Notes for the test statistic corresponding with sample `y`.
|
||||
pvalue : float
|
||||
The associated *p*-value for the chosen `alternative`.
|
||||
|
||||
Notes
|
||||
-----
|
||||
If ``U1`` is the statistic corresponding with sample `x`, then the
|
||||
statistic corresponding with sample `y` is
|
||||
`U2 = `x.shape[axis] * y.shape[axis] - U1``.
|
||||
|
||||
`mannwhitneyu` is for independent samples. For related / paired samples,
|
||||
consider `scipy.stats.wilcoxon`.
|
||||
|
||||
`method` ``'exact'`` is recommended when there are no ties and when either
|
||||
sample size is less than 8 [1]_. The implementation follows the recurrence
|
||||
relation originally proposed in [1]_ as it is described in [3]_.
|
||||
Note that the exact method is *not* corrected for ties, but
|
||||
`mannwhitneyu` will not raise errors or warnings if there are ties in the
|
||||
data.
|
||||
|
||||
The Mann-Whitney U test is a non-parametric version of the t-test for
|
||||
independent samples. When the means of samples from the populations
|
||||
are normally distributed, consider `scipy.stats.ttest_ind`.
|
||||
|
||||
See Also
|
||||
--------
|
||||
scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random
|
||||
variables is stochastically larger than the other", The Annals of
|
||||
Mathematical Statistics, Vol. 18, pp. 50-60, 1947.
|
||||
.. [2] Mann-Whitney U Test, Wikipedia,
|
||||
http://en.wikipedia.org/wiki/Mann-Whitney_U_test
|
||||
.. [3] A. Di Bucchianico, "Combinatorics, computer algebra, and the
|
||||
Wilcoxon-Mann-Whitney test", Journal of Statistical Planning and
|
||||
Inference, Vol. 79, pp. 349-364, 1999.
|
||||
.. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics
|
||||
Learning Support Centre, 2004.
|
||||
.. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney
|
||||
or t-test? On assumptions for hypothesis tests and multiple \
|
||||
interpretations of decision rules." Statistics surveys, Vol. 4, pp.
|
||||
1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/
|
||||
|
||||
Examples
|
||||
--------
|
||||
We follow the example from [4]_: nine randomly sampled young adults were
|
||||
diagnosed with type II diabetes at the ages below.
|
||||
|
||||
>>> males = [19, 22, 16, 29, 24]
|
||||
>>> females = [20, 11, 17, 12]
|
||||
|
||||
We use the Mann-Whitney U test to assess whether there is a statistically
|
||||
significant difference in the diagnosis age of males and females.
|
||||
The null hypothesis is that the distribution of male diagnosis ages is
|
||||
the same as the distribution of female diagnosis ages. We decide
|
||||
that a confidence level of 95% is required to reject the null hypothesis
|
||||
in favor of the alternative that the distributions are different.
|
||||
Since the number of samples is very small and there are no ties in the
|
||||
data, we can compare the observed test statistic against the *exact*
|
||||
distribution of the test statistic under the null hypothesis.
|
||||
|
||||
>>> from scipy.stats import mannwhitneyu
|
||||
>>> U1, p = mannwhitneyu(males, females, method="exact")
|
||||
>>> print(U1)
|
||||
17.0
|
||||
|
||||
`mannwhitneyu` always reports the statistic associated with the first
|
||||
sample, which, in this case, is males. This agrees with :math:`U_M = 17`
|
||||
reported in [4]_. The statistic associated with the second statistic
|
||||
can be calculated:
|
||||
|
||||
>>> nx, ny = len(males), len(females)
|
||||
>>> U2 = nx*ny - U1
|
||||
>>> print(U2)
|
||||
3.0
|
||||
|
||||
This agrees with :math:`U_F = 3` reported in [4]_. The two-sided
|
||||
*p*-value can be calculated from either statistic, and the value produced
|
||||
by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_.
|
||||
|
||||
>>> print(p)
|
||||
0.1111111111111111
|
||||
|
||||
The exact distribution of the test statistic is asymptotically normal, so
|
||||
the example continues by comparing the exact *p*-value against the
|
||||
*p*-value produced using the normal approximation.
|
||||
|
||||
>>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")
|
||||
>>> print(pnorm)
|
||||
0.11134688653314041
|
||||
|
||||
Here `mannwhitneyu`'s reported *p*-value appears to conflict with the
|
||||
value :math:`p = 0.09` given in [4]_. The reason is that [4]_
|
||||
does not apply the continuity correction performed by `mannwhitneyu`;
|
||||
`mannwhitneyu` reduces the distance between the test statistic and the
|
||||
mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the
|
||||
discrete statistic is being compared against a continuous distribution.
|
||||
Here, the :math:`U` statistic used is less than the mean, so we reduce
|
||||
the distance by adding 0.5 in the numerator.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import norm
|
||||
>>> U = min(U1, U2)
|
||||
>>> N = nx + ny
|
||||
>>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)
|
||||
>>> p = 2 * norm.cdf(z) # use CDF to get p-value from smaller statistic
|
||||
>>> print(p)
|
||||
0.11134688653314041
|
||||
|
||||
If desired, we can disable the continuity correction to get a result
|
||||
that agrees with that reported in [4]_.
|
||||
|
||||
>>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,
|
||||
... method="asymptotic")
|
||||
>>> print(pnorm)
|
||||
0.0864107329737
|
||||
|
||||
Regardless of whether we perform an exact or asymptotic test, the
|
||||
probability of the test statistic being as extreme or more extreme by
|
||||
chance exceeds 5%, so we do not consider the results statistically
|
||||
significant.
|
||||
|
||||
Suppose that, before seeing the data, we had hypothesized that females
|
||||
would tend to be diagnosed at a younger age than males.
|
||||
In that case, it would be natural to provide the female ages as the
|
||||
first input, and we would have performed a one-sided test using
|
||||
``alternative = 'less'``: females are diagnosed at an age that is
|
||||
stochastically less than that of males.
|
||||
|
||||
>>> res = mannwhitneyu(females, males, alternative="less", method="exact")
|
||||
>>> print(res)
|
||||
MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)
|
||||
|
||||
Again, the probability of getting a sufficiently low value of the
|
||||
test statistic by chance under the null hypothesis is greater than 5%,
|
||||
so we do not reject the null hypothesis in favor of our alternative.
|
||||
|
||||
If it is reasonable to assume that the means of samples from the
|
||||
populations are normally distributed, we could have used a t-test to
|
||||
perform the analysis.
|
||||
|
||||
>>> from scipy.stats import ttest_ind
|
||||
>>> res = ttest_ind(females, males, alternative="less")
|
||||
>>> print(res)
|
||||
Ttest_indResult(statistic=-2.239334696520584, pvalue=0.030068441095757924)
|
||||
|
||||
Under this assumption, the *p*-value would be low enough to reject the
|
||||
null hypothesis in favor of the alternative.
|
||||
|
||||
'''
|
||||
|
||||
x, y, use_continuity, alternative, axis_int, method = (
|
||||
_mwu_input_validation(x, y, use_continuity, alternative, axis, method))
|
||||
|
||||
x, y, xy = _broadcast_concatenate(x, y, axis)
|
||||
|
||||
n1, n2 = x.shape[-1], y.shape[-1]
|
||||
|
||||
if method == "auto":
|
||||
method = _mwu_choose_method(n1, n2, xy, method)
|
||||
|
||||
# Follows [2]
|
||||
ranks = stats.rankdata(xy, axis=-1) # method 2, step 1
|
||||
R1 = ranks[..., :n1].sum(axis=-1) # method 2, step 2
|
||||
U1 = R1 - n1*(n1+1)/2 # method 2, step 3
|
||||
U2 = n1 * n2 - U1 # as U1 + U2 = n1 * n2
|
||||
|
||||
if alternative == "greater":
|
||||
U, f = U1, 1 # U is the statistic to use for p-value, f is a factor
|
||||
elif alternative == "less":
|
||||
U, f = U2, 1 # Due to symmetry, use SF of U2 rather than CDF of U1
|
||||
else:
|
||||
U, f = np.maximum(U1, U2), 2 # multiply SF by two for two-sided test
|
||||
|
||||
if method == "exact":
|
||||
p = _mwu_state.sf(U.astype(int), n1, n2)
|
||||
elif method == "asymptotic":
|
||||
z = _get_mwu_z(U, n1, n2, ranks, continuity=use_continuity)
|
||||
p = stats.norm.sf(z)
|
||||
p *= f
|
||||
|
||||
# Ensure that test statistic is not greater than 1
|
||||
# This could happen for exact test when U = m*n/2
|
||||
p = np.clip(p, 0, 1)
|
||||
|
||||
return MannwhitneyuResult(U1, p)
|
||||
4186
.CondaPkg/env/Lib/site-packages/scipy/stats/_morestats.py
vendored
Normal file
4186
.CondaPkg/env/Lib/site-packages/scipy/stats/_morestats.py
vendored
Normal file
File diff suppressed because it is too large
Load Diff
3521
.CondaPkg/env/Lib/site-packages/scipy/stats/_mstats_basic.py
vendored
Normal file
3521
.CondaPkg/env/Lib/site-packages/scipy/stats/_mstats_basic.py
vendored
Normal file
File diff suppressed because it is too large
Load Diff
500
.CondaPkg/env/Lib/site-packages/scipy/stats/_mstats_extras.py
vendored
Normal file
500
.CondaPkg/env/Lib/site-packages/scipy/stats/_mstats_extras.py
vendored
Normal file
@@ -0,0 +1,500 @@
|
||||
"""
|
||||
Additional statistics functions with support for masked arrays.
|
||||
|
||||
"""
|
||||
|
||||
# Original author (2007): Pierre GF Gerard-Marchant
|
||||
|
||||
|
||||
__all__ = ['compare_medians_ms',
|
||||
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
|
||||
'idealfourths',
|
||||
'median_cihs','mjci','mquantiles_cimj',
|
||||
'rsh',
|
||||
'trimmed_mean_ci',]
|
||||
|
||||
|
||||
import numpy as np
|
||||
from numpy import float_, int_, ndarray
|
||||
|
||||
import numpy.ma as ma
|
||||
from numpy.ma import MaskedArray
|
||||
|
||||
from . import _mstats_basic as mstats
|
||||
|
||||
from scipy.stats.distributions import norm, beta, t, binom
|
||||
|
||||
|
||||
def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
|
||||
"""
|
||||
Computes quantile estimates with the Harrell-Davis method.
|
||||
|
||||
The quantile estimates are calculated as a weighted linear combination
|
||||
of order statistics.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Data array.
|
||||
prob : sequence, optional
|
||||
Sequence of quantiles to compute.
|
||||
axis : int or None, optional
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
var : bool, optional
|
||||
Whether to return the variance of the estimate.
|
||||
|
||||
Returns
|
||||
-------
|
||||
hdquantiles : MaskedArray
|
||||
A (p,) array of quantiles (if `var` is False), or a (2,p) array of
|
||||
quantiles and variances (if `var` is True), where ``p`` is the
|
||||
number of quantiles.
|
||||
|
||||
See Also
|
||||
--------
|
||||
hdquantiles_sd
|
||||
|
||||
"""
|
||||
def _hd_1D(data,prob,var):
|
||||
"Computes the HD quantiles for a 1D array. Returns nan for invalid data."
|
||||
xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
|
||||
# Don't use length here, in case we have a numpy scalar
|
||||
n = xsorted.size
|
||||
|
||||
hd = np.empty((2,len(prob)), float_)
|
||||
if n < 2:
|
||||
hd.flat = np.nan
|
||||
if var:
|
||||
return hd
|
||||
return hd[0]
|
||||
|
||||
v = np.arange(n+1) / float(n)
|
||||
betacdf = beta.cdf
|
||||
for (i,p) in enumerate(prob):
|
||||
_w = betacdf(v, (n+1)*p, (n+1)*(1-p))
|
||||
w = _w[1:] - _w[:-1]
|
||||
hd_mean = np.dot(w, xsorted)
|
||||
hd[0,i] = hd_mean
|
||||
#
|
||||
hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
|
||||
#
|
||||
hd[0, prob == 0] = xsorted[0]
|
||||
hd[0, prob == 1] = xsorted[-1]
|
||||
if var:
|
||||
hd[1, prob == 0] = hd[1, prob == 1] = np.nan
|
||||
return hd
|
||||
return hd[0]
|
||||
# Initialization & checks
|
||||
data = ma.array(data, copy=False, dtype=float_)
|
||||
p = np.array(prob, copy=False, ndmin=1)
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None) or (data.ndim == 1):
|
||||
result = _hd_1D(data, p, var)
|
||||
else:
|
||||
if data.ndim > 2:
|
||||
raise ValueError("Array 'data' must be at most two dimensional, "
|
||||
"but got data.ndim = %d" % data.ndim)
|
||||
result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
|
||||
|
||||
return ma.fix_invalid(result, copy=False)
|
||||
|
||||
|
||||
def hdmedian(data, axis=-1, var=False):
|
||||
"""
|
||||
Returns the Harrell-Davis estimate of the median along the given axis.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ndarray
|
||||
Data array.
|
||||
axis : int, optional
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
var : bool, optional
|
||||
Whether to return the variance of the estimate.
|
||||
|
||||
Returns
|
||||
-------
|
||||
hdmedian : MaskedArray
|
||||
The median values. If ``var=True``, the variance is returned inside
|
||||
the masked array. E.g. for a 1-D array the shape change from (1,) to
|
||||
(2,).
|
||||
|
||||
"""
|
||||
result = hdquantiles(data,[0.5], axis=axis, var=var)
|
||||
return result.squeeze()
|
||||
|
||||
|
||||
def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
|
||||
"""
|
||||
The standard error of the Harrell-Davis quantile estimates by jackknife.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Data array.
|
||||
prob : sequence, optional
|
||||
Sequence of quantiles to compute.
|
||||
axis : int, optional
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
|
||||
Returns
|
||||
-------
|
||||
hdquantiles_sd : MaskedArray
|
||||
Standard error of the Harrell-Davis quantile estimates.
|
||||
|
||||
See Also
|
||||
--------
|
||||
hdquantiles
|
||||
|
||||
"""
|
||||
def _hdsd_1D(data, prob):
|
||||
"Computes the std error for 1D arrays."
|
||||
xsorted = np.sort(data.compressed())
|
||||
n = len(xsorted)
|
||||
|
||||
hdsd = np.empty(len(prob), float_)
|
||||
if n < 2:
|
||||
hdsd.flat = np.nan
|
||||
|
||||
vv = np.arange(n) / float(n-1)
|
||||
betacdf = beta.cdf
|
||||
|
||||
for (i,p) in enumerate(prob):
|
||||
_w = betacdf(vv, n*p, n*(1-p))
|
||||
w = _w[1:] - _w[:-1]
|
||||
# cumulative sum of weights and data points if
|
||||
# ith point is left out for jackknife
|
||||
mx_ = np.zeros_like(xsorted)
|
||||
mx_[1:] = np.cumsum(w * xsorted[:-1])
|
||||
# similar but from the right
|
||||
mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
|
||||
hdsd[i] = np.sqrt(mx_.var() * (n - 1))
|
||||
return hdsd
|
||||
|
||||
# Initialization & checks
|
||||
data = ma.array(data, copy=False, dtype=float_)
|
||||
p = np.array(prob, copy=False, ndmin=1)
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None):
|
||||
result = _hdsd_1D(data, p)
|
||||
else:
|
||||
if data.ndim > 2:
|
||||
raise ValueError("Array 'data' must be at most two dimensional, "
|
||||
"but got data.ndim = %d" % data.ndim)
|
||||
result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
|
||||
|
||||
return ma.fix_invalid(result, copy=False).ravel()
|
||||
|
||||
|
||||
def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
|
||||
alpha=0.05, axis=None):
|
||||
"""
|
||||
Selected confidence interval of the trimmed mean along the given axis.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Input data.
|
||||
limits : {None, tuple}, optional
|
||||
None or a two item tuple.
|
||||
Tuple of the percentages to cut on each side of the array, with respect
|
||||
to the number of unmasked data, as floats between 0. and 1. If ``n``
|
||||
is the number of unmasked data before trimming, then
|
||||
(``n * limits[0]``)th smallest data and (``n * limits[1]``)th
|
||||
largest data are masked. The total number of unmasked data after
|
||||
trimming is ``n * (1. - sum(limits))``.
|
||||
The value of one limit can be set to None to indicate an open interval.
|
||||
|
||||
Defaults to (0.2, 0.2).
|
||||
inclusive : (2,) tuple of boolean, optional
|
||||
If relative==False, tuple indicating whether values exactly equal to
|
||||
the absolute limits are allowed.
|
||||
If relative==True, tuple indicating whether the number of data being
|
||||
masked on each side should be rounded (True) or truncated (False).
|
||||
|
||||
Defaults to (True, True).
|
||||
alpha : float, optional
|
||||
Confidence level of the intervals.
|
||||
|
||||
Defaults to 0.05.
|
||||
axis : int, optional
|
||||
Axis along which to cut. If None, uses a flattened version of `data`.
|
||||
|
||||
Defaults to None.
|
||||
|
||||
Returns
|
||||
-------
|
||||
trimmed_mean_ci : (2,) ndarray
|
||||
The lower and upper confidence intervals of the trimmed data.
|
||||
|
||||
"""
|
||||
data = ma.array(data, copy=False)
|
||||
trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
|
||||
tmean = trimmed.mean(axis)
|
||||
tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
|
||||
df = trimmed.count(axis) - 1
|
||||
tppf = t.ppf(1-alpha/2.,df)
|
||||
return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
|
||||
|
||||
|
||||
def mjci(data, prob=[0.25,0.5,0.75], axis=None):
|
||||
"""
|
||||
Returns the Maritz-Jarrett estimators of the standard error of selected
|
||||
experimental quantiles of the data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ndarray
|
||||
Data array.
|
||||
prob : sequence, optional
|
||||
Sequence of quantiles to compute.
|
||||
axis : int or None, optional
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
|
||||
"""
|
||||
def _mjci_1D(data, p):
|
||||
data = np.sort(data.compressed())
|
||||
n = data.size
|
||||
prob = (np.array(p) * n + 0.5).astype(int_)
|
||||
betacdf = beta.cdf
|
||||
|
||||
mj = np.empty(len(prob), float_)
|
||||
x = np.arange(1,n+1, dtype=float_) / n
|
||||
y = x - 1./n
|
||||
for (i,m) in enumerate(prob):
|
||||
W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
|
||||
C1 = np.dot(W,data)
|
||||
C2 = np.dot(W,data**2)
|
||||
mj[i] = np.sqrt(C2 - C1**2)
|
||||
return mj
|
||||
|
||||
data = ma.array(data, copy=False)
|
||||
if data.ndim > 2:
|
||||
raise ValueError("Array 'data' must be at most two dimensional, "
|
||||
"but got data.ndim = %d" % data.ndim)
|
||||
|
||||
p = np.array(prob, copy=False, ndmin=1)
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None):
|
||||
return _mjci_1D(data, p)
|
||||
else:
|
||||
return ma.apply_along_axis(_mjci_1D, axis, data, p)
|
||||
|
||||
|
||||
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
|
||||
"""
|
||||
Computes the alpha confidence interval for the selected quantiles of the
|
||||
data, with Maritz-Jarrett estimators.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ndarray
|
||||
Data array.
|
||||
prob : sequence, optional
|
||||
Sequence of quantiles to compute.
|
||||
alpha : float, optional
|
||||
Confidence level of the intervals.
|
||||
axis : int or None, optional
|
||||
Axis along which to compute the quantiles.
|
||||
If None, use a flattened array.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ci_lower : ndarray
|
||||
The lower boundaries of the confidence interval. Of the same length as
|
||||
`prob`.
|
||||
ci_upper : ndarray
|
||||
The upper boundaries of the confidence interval. Of the same length as
|
||||
`prob`.
|
||||
|
||||
"""
|
||||
alpha = min(alpha, 1 - alpha)
|
||||
z = norm.ppf(1 - alpha/2.)
|
||||
xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
|
||||
smj = mjci(data, prob, axis=axis)
|
||||
return (xq - z * smj, xq + z * smj)
|
||||
|
||||
|
||||
def median_cihs(data, alpha=0.05, axis=None):
|
||||
"""
|
||||
Computes the alpha-level confidence interval for the median of the data.
|
||||
|
||||
Uses the Hettmasperger-Sheather method.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Input data. Masked values are discarded. The input should be 1D only,
|
||||
or `axis` should be set to None.
|
||||
alpha : float, optional
|
||||
Confidence level of the intervals.
|
||||
axis : int or None, optional
|
||||
Axis along which to compute the quantiles. If None, use a flattened
|
||||
array.
|
||||
|
||||
Returns
|
||||
-------
|
||||
median_cihs
|
||||
Alpha level confidence interval.
|
||||
|
||||
"""
|
||||
def _cihs_1D(data, alpha):
|
||||
data = np.sort(data.compressed())
|
||||
n = len(data)
|
||||
alpha = min(alpha, 1-alpha)
|
||||
k = int(binom._ppf(alpha/2., n, 0.5))
|
||||
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
|
||||
if gk < 1-alpha:
|
||||
k -= 1
|
||||
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
|
||||
gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
|
||||
I = (gk - 1 + alpha)/(gk - gkk)
|
||||
lambd = (n-k) * I / float(k + (n-2*k)*I)
|
||||
lims = (lambd*data[k] + (1-lambd)*data[k-1],
|
||||
lambd*data[n-k-1] + (1-lambd)*data[n-k])
|
||||
return lims
|
||||
data = ma.array(data, copy=False)
|
||||
# Computes quantiles along axis (or globally)
|
||||
if (axis is None):
|
||||
result = _cihs_1D(data, alpha)
|
||||
else:
|
||||
if data.ndim > 2:
|
||||
raise ValueError("Array 'data' must be at most two dimensional, "
|
||||
"but got data.ndim = %d" % data.ndim)
|
||||
result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def compare_medians_ms(group_1, group_2, axis=None):
|
||||
"""
|
||||
Compares the medians from two independent groups along the given axis.
|
||||
|
||||
The comparison is performed using the McKean-Schrader estimate of the
|
||||
standard error of the medians.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
group_1 : array_like
|
||||
First dataset. Has to be of size >=7.
|
||||
group_2 : array_like
|
||||
Second dataset. Has to be of size >=7.
|
||||
axis : int, optional
|
||||
Axis along which the medians are estimated. If None, the arrays are
|
||||
flattened. If `axis` is not None, then `group_1` and `group_2`
|
||||
should have the same shape.
|
||||
|
||||
Returns
|
||||
-------
|
||||
compare_medians_ms : {float, ndarray}
|
||||
If `axis` is None, then returns a float, otherwise returns a 1-D
|
||||
ndarray of floats with a length equal to the length of `group_1`
|
||||
along `axis`.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
>>> from scipy import stats
|
||||
>>> a = [1, 2, 3, 4, 5, 6, 7]
|
||||
>>> b = [8, 9, 10, 11, 12, 13, 14]
|
||||
>>> stats.mstats.compare_medians_ms(a, b, axis=None)
|
||||
1.0693225866553746e-05
|
||||
|
||||
The function is vectorized to compute along a given axis.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> x = rng.random(size=(3, 7))
|
||||
>>> y = rng.random(size=(3, 8))
|
||||
>>> stats.mstats.compare_medians_ms(x, y, axis=1)
|
||||
array([0.36908985, 0.36092538, 0.2765313 ])
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
|
||||
for studentizing the sample median." Communications in
|
||||
Statistics-Simulation and Computation 13.6 (1984): 751-773.
|
||||
|
||||
"""
|
||||
(med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
|
||||
(std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
|
||||
mstats.stde_median(group_2, axis=axis))
|
||||
W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
|
||||
return 1 - norm.cdf(W)
|
||||
|
||||
|
||||
def idealfourths(data, axis=None):
|
||||
"""
|
||||
Returns an estimate of the lower and upper quartiles.
|
||||
|
||||
Uses the ideal fourths algorithm.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array_like
|
||||
Input array.
|
||||
axis : int, optional
|
||||
Axis along which the quartiles are estimated. If None, the arrays are
|
||||
flattened.
|
||||
|
||||
Returns
|
||||
-------
|
||||
idealfourths : {list of floats, masked array}
|
||||
Returns the two internal values that divide `data` into four parts
|
||||
using the ideal fourths algorithm either along the flattened array
|
||||
(if `axis` is None) or along `axis` of `data`.
|
||||
|
||||
"""
|
||||
def _idf(data):
|
||||
x = data.compressed()
|
||||
n = len(x)
|
||||
if n < 3:
|
||||
return [np.nan,np.nan]
|
||||
(j,h) = divmod(n/4. + 5/12.,1)
|
||||
j = int(j)
|
||||
qlo = (1-h)*x[j-1] + h*x[j]
|
||||
k = n - j
|
||||
qup = (1-h)*x[k] + h*x[k-1]
|
||||
return [qlo, qup]
|
||||
data = ma.sort(data, axis=axis).view(MaskedArray)
|
||||
if (axis is None):
|
||||
return _idf(data)
|
||||
else:
|
||||
return ma.apply_along_axis(_idf, axis, data)
|
||||
|
||||
|
||||
def rsh(data, points=None):
|
||||
"""
|
||||
Evaluates Rosenblatt's shifted histogram estimators for each data point.
|
||||
|
||||
Rosenblatt's estimator is a centered finite-difference approximation to the
|
||||
derivative of the empirical cumulative distribution function.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : sequence
|
||||
Input data, should be 1-D. Masked values are ignored.
|
||||
points : sequence or None, optional
|
||||
Sequence of points where to evaluate Rosenblatt shifted histogram.
|
||||
If None, use the data.
|
||||
|
||||
"""
|
||||
data = ma.array(data, copy=False)
|
||||
if points is None:
|
||||
points = data
|
||||
else:
|
||||
points = np.array(points, copy=False, ndmin=1)
|
||||
|
||||
if data.ndim != 1:
|
||||
raise AttributeError("The input array should be 1D only !")
|
||||
|
||||
n = data.count()
|
||||
r = idealfourths(data, axis=None)
|
||||
h = 1.2 * (r[-1]-r[0]) / n**(1./5)
|
||||
nhi = (data[:,None] <= points[None,:] + h).sum(0)
|
||||
nlo = (data[:,None] < points[None,:] - h).sum(0)
|
||||
return (nhi-nlo) / (2.*n*h)
|
||||
5691
.CondaPkg/env/Lib/site-packages/scipy/stats/_multivariate.py
vendored
Normal file
5691
.CondaPkg/env/Lib/site-packages/scipy/stats/_multivariate.py
vendored
Normal file
File diff suppressed because it is too large
Load Diff
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_mvn.cp311-win_amd64.dll.a
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_mvn.cp311-win_amd64.dll.a
vendored
Normal file
Binary file not shown.
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_mvn.cp311-win_amd64.pyd
vendored
Normal file
BIN
.CondaPkg/env/Lib/site-packages/scipy/stats/_mvn.cp311-win_amd64.pyd
vendored
Normal file
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user