stats in python

Unsurprisingly, math fits better in my head as code. Here are some statistics functions I’ve written to better understand the concepts.

from statistics import NormalDist
from math import sqrt
from scipy.stats import t
 
def get_zscore_for_datapoint(x, mean, stddev):
    """Return how many standard deviations ``x`` lies from ``mean``.

    The result is positive when ``x`` is above the mean and negative when it
    is below. A ``stddev`` of zero raises ``ZeroDivisionError``.
    """
    deviation = x - mean
    return deviation / stddev
 
def prob_between_zscore(z1, z2):
    """Return the standard-normal probability of falling between two z-scores.

    The two bounds may be given in either order; the result is always the
    non-negative area under the standard normal curve between them.
    """
    standard_normal = NormalDist()
    # Take the absolute difference so that passing (lower, upper) does not
    # yield a negative "probability".
    return abs(standard_normal.cdf(z1) - standard_normal.cdf(z2))
 
 
def get_value_from_percentile(percentile, mean, stddev):
    """Return the value at ``percentile`` of a normal distribution.

    ``percentile`` is a cumulative probability passed straight to
    ``NormalDist().inv_cdf`` (so it must lie strictly between 0 and 1). The
    resulting z-score is rescaled by ``stddev`` and shifted by ``mean``.
    """
    z_score = NormalDist().inv_cdf(percentile)
    return z_score * stddev + mean
 
def get_critical_t_value(df, confidence):
    """Return the two-sided critical t value for ``confidence`` and ``df``.

    ``confidence`` is converted to a cumulative probability by
    ``half_confidence_tail`` and then fed to the t distribution's percent
    point function with ``df`` degrees of freedom.
    """
    # ppf = percent point function (inverse of cdf).
    # On the calculator, this is "Inverse t".
    cumulative_probability = half_confidence_tail(confidence)
    return t.ppf(cumulative_probability, df)
 
def half_confidence_tail(confidence):
    """Return the cumulative probability at the upper edge of a two-sided interval.

    For a confidence level of 95 this is 0.975: the central 95% plus the
    lower 2.5% tail. The result is suitable for passing to an inverse CDF.

    ``confidence`` may be given either as a percentage (``95``) or as a
    fraction (``.95``). Values strictly between 0 and 1 are treated as
    fractions; everything else is treated as a percentage, exactly as before.
    """
    if 0 < confidence < 1:
        # Normalise a fractional confidence level to a percentage.
        confidence = confidence * 100
    alpha_level = (100 - confidence) / 2
    return .01 * (100 - alpha_level)
 
def get_confidence_interval(mean, stddev, n, confidence):
    """Return the ``(lower, upper)`` confidence interval around ``mean``.

    The half-width of the interval is whatever ``margin_of_error`` returns
    for the given ``stddev``, sample size ``n`` and ``confidence`` level.
    """
    half_width = margin_of_error(stddev, n, confidence)
    lower_bound = mean - half_width
    upper_bound = mean + half_width
    return (lower_bound, upper_bound)
 
def t_critical_value(alpha, n):
    """Return the upper-tail critical t value for a sample of size ``n``.

    Uses ``n - 1`` degrees of freedom and the quantile ``1 - alpha``.

    Note: You need to split alpha in half if it's a double tail test.
    """
    # This is the same as t-inverse.
    degrees_of_freedom = n - 1
    upper_quantile = 1 - alpha
    return t.ppf(upper_quantile, degrees_of_freedom)
 
def test_statistic(u, u0, s, n):
    """Return the one-sample test statistic ``(u - u0) / (s / sqrt(n))``.

    Args:
        u: observed sample mean.
        u0: hypothesised mean being tested against.
        s: standard deviation used for the standard error.
        n: sample size.
    """
    standard_error = s / sqrt(n)
    return (u - u0) / standard_error


# The ``test_`` prefix matches pytest's default collection pattern. Opt out so
# that a test module importing this helper does not try to run it as a test
# (which would fail looking for fixtures named u, u0, s and n).
test_statistic.__test__ = False
 
## UNSURE about what's below.
def get_sample_size_from_confidence(confidence, mean, stddev, margin_of_error):
    """Return the smallest sample size that achieves ``margin_of_error``.

    Computes ``(z * stddev / margin_of_error) ** 2`` using the standard-normal
    critical value for ``confidence``, then rounds up to the next whole
    number: a sample size must be an integer, and rounding down would leave
    the margin of error larger than requested.

    Args:
        confidence: confidence level, forwarded to ``half_confidence_tail``.
        mean: unused; kept so existing callers keep working.
        stddev: standard deviation of the quantity being estimated.
        margin_of_error: desired half-width of the confidence interval.

    Returns:
        The required sample size as an ``int``.
    """
    # Local import: the file's import block only brings in ``sqrt`` from math.
    from math import ceil

    z = NormalDist().inv_cdf(half_confidence_tail(confidence))
    return ceil((z * stddev / margin_of_error) ** 2)
 
def margin_of_error(stddev, n, confidence, sigma_known=False):
    """Return the half-width of a confidence interval for a mean.

    By default the critical value comes from the t distribution with
    ``n - 1`` degrees of freedom, whatever the sample size. That is the
    appropriate choice when ``stddev`` was estimated from the sample, and it
    converges to the normal critical value as ``n`` grows. Previously the
    function switched to a z critical value for ``n >= 30``, which produced a
    narrower interval than the t-based one the module's own check expects for
    n=49.

    Args:
        stddev: standard deviation of the data.
        n: sample size.
        confidence: confidence level, forwarded to the critical-value helpers.
        sigma_known: set to True when ``stddev`` is the known population
            standard deviation; the standard-normal critical value is then
            used instead of t.
    """
    if sigma_known:
        # z-statistic
        critical_value = NormalDist().inv_cdf(half_confidence_tail(confidence))
    else:
        # t-statistic
        critical_value = get_critical_t_value(n - 1, confidence)
    return critical_value * stddev / sqrt(n)
 
def in_critical_region(critical_value, n, alpha, tail="left"):
    """Return whether a statistic falls in the rejection region at level ``alpha``.

    The tail probability of ``critical_value`` is taken from the t
    distribution with ``n - 1`` degrees of freedom when ``n < 30`` and from
    the standard normal distribution otherwise, then compared with ``alpha``.

    Args:
        critical_value: the value whose tail probability is evaluated (it is
            used here as the observed test statistic).
        n: sample size.
        alpha: significance level.
        tail: ``"left"`` (default, the original behaviour), ``"right"`` or
            ``"two"``. Comparing the lower-tail CDF with ``<`` is only correct
            for a left-tailed test; the other options cover the remaining
            cases.

    Raises:
        ValueError: if ``tail`` is not one of the supported names.
    """
    if tail not in ("left", "right", "two"):
        raise ValueError(f"tail must be 'left', 'right' or 'two', got {tail!r}")

    if n < 30:
        lower_tail = t.cdf(critical_value, n - 1)
    else:
        lower_tail = NormalDist().cdf(critical_value)

    if tail == "left":
        return lower_tail < alpha

    upper_tail = 1 - lower_tail
    if tail == "right":
        return upper_tail < alpha

    # Two-tailed: double the smaller tail to get the two-sided p-value.
    return 2 * min(lower_tail, upper_tail) < alpha
 
# Import-time sanity check: 99% confidence interval for a sample with
# mean 25.1, standard deviation 12.2 and n = 49, compared to one decimal place.
a, b = get_confidence_interval(mean=25.1, stddev=12.2, n=49, confidence=99)
assert (round(a, 1), round(b, 1)) == (20.4, 29.8)
 
def calc_pvalue():
    """Placeholder that is intentionally left unimplemented.

    Nah. Use the t-test or z-test on your calculator.
    """
    return None
 
def stddev(l):
    """Return the sample standard deviation of the sequence ``l``.

    The sum of squared deviations from the mean is divided by ``len(l) - 1``
    before taking the square root, so sequences with fewer than two items
    raise ``ZeroDivisionError``.
    """
    count = len(l)
    average = sum(l) / count
    squared_deviations = [(value - average) ** 2 for value in l]
    variance = sum(squared_deviations) / (count - 1)
    return sqrt(variance)

The notes of Justin Abrahms

Recently updated

latency is not normal(ly distributed)

incident severity

Standard Deviation

Explorer

stats in python

Graph View

Backlinks