Lecture 15 - Fitting distributions to data

(previous, next)



More asymptotic distributions

Using methods similar to what we did for the Poisson distribution, we can derive other asymptotic approximations to distributions.

Normal aka Gaussian

\[\frac{1}{\sigma\sqrt{2\pi} } \; e^{ -\frac{(x-\mu)^2}{2\sigma^2} }\]

Fisher--Tippett--Gumbel extreme value theory

Others

Many of the common probability distributions have special relationships between each other -- see Distribution relationships.

Examples of applications of Poisson

Now, let's return to our Poisson distribution, which we've derived as a possible model of bolides burning up in Earth's atmosphere. We'd like to see if the Poisson distribution does a good job of matching the observations of bolides. But we've got a catch, in that we don't know what the parameter \(\lambda\)'s true value is. How can we compare our prediction to our data if we don't know \(\lambda\)?

Well, one answer is to just try many values of \(\lambda\) and look for the one that does the best job of fitting the data. Let's look at some classical examples.

Deaths from horse-kicks

Here is a classic example of something that seems totally unrelated but can be explained by the Poisson distribution. Notice that in this case, the Poisson distribution only fails to explain about 4 percent of the distribution's mass when the rate parameter is picked optimally. That's pretty good, given the "totally random" nature of the process.

Annual deaths from horse kicks in the Prussian army (1875-1894)
Deaths Record
0 144
1 91
2 32
3 11
4 2
5 0
6 0

(Bulmer's discussion of this data and its source, tabulated data file and the raw data file)

[Show code]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python

"""
This script creates an animation showing how
a distribution can be fit to a data.

First, we simulate or load a data set.  Then we
compare the data to binomial distributions with
a range of success probabilities.
"""

from scipy import floor, cumsum, array, linspace, cos, pi, rand, loadtxt
from scipy.stats import poisson
from scipy.optimize import fminbound
from matplotlib.pyplot import *
import time

def norm(x,e):
    # p-norm of the vector x: (sum_i |x_i|^e)^(1/e).
    total = sum(abs(x)**e)
    return total**(1./e)

def rndBinomial(p, n, replicas=1):
    """Draw binomial(n, p) variates by thresholding uniform samples.

    floor(rand() + p) is 1 with probability p and 0 otherwise, so summing
    n of them gives one binomial(n, p) draw.

    Returns a single int when replicas == 1, otherwise a list of
    `replicas` independent draws.
    """
    if replicas == 1:
        return int(sum(floor(rand(n) + p)))
    # range() (not the Python-2-only xrange) keeps this runnable on Python 3.
    return [rndBinomial(p, n) for _ in range(replicas)]

def main():
    """Fit a Poisson distribution to the horse-kick data and animate the fit.

    Loads (count, frequency) pairs, finds the rate lambda minimizing the
    relative L1 distance between the empirical frequencies and the Poisson
    pmf, then sweeps lambda across (0, 3) saving one image per step.
    """
    # load data
    data = loadtxt('horse_kicks.txt',delimiter=',')
    # column 0: number of deaths, column 1: number of years observed
    x, y = map(array, zip(*data))
    N = sum(y)  # total number of observations

    Y = y*1./N  # empirical probability mass function
    # distribution fitting
    def fit_error(L_guess):
        # calculate the distance between our guess and the data
        z = poisson.pmf(x, L_guess)
        return norm(Y-z, 1)/norm(Y,1) # can be any p-norm
    # bounded scalar minimization over lambda in (0, 3)
    L_best = fminbound(fit_error, 1e-5, 3-1e-5)
    err_best = fit_error(L_best)
    # NOTE: Python 2 print statement; script is not Python-3 compatible as-is.
    print "Best fit: L = %f,  error = %f"%(L_best, fit_error(L_best))

    # generate data for animation
    F = [ (p, fit_error(p)) for p in linspace(1e-4,3-1e-4, 137) ]
    u, v = map(array, zip(*F))

    # draw basic figure
    fig = figure()

    # lower panel: fit error as a function of lambda
    subplot(2,1,2)
    plot(u, v, 'b-')
    #plot( L_best, fit_error(L_best), 'ko')
    # plot() returns a list of Line2D objects; keep the marker so the
    # animation loop below can move it along the error curve
    marker_fit_error = plot( u[2], v[2], 'ro').pop()
    ylim(0,1)
    xlabel('Poisson intensity $\lambda$')

    # upper panel: observed counts vs. best-fit Poisson prediction
    subplot(2,1,1)
    plot(x, y, 'ko')
    z = poisson.pmf(x, L_best)
    width = .3
    xw = x-width*.5  # center the bars on the integer counts
    bar_ob = bar(xw, z*N, width, color='r')
    title('N = %d observations, $\lambda_{best} = %.3f$, $Err_{best} = %1.2g$'%(N, L_best, err_best))
    ylabel('Number of times observed')
    xlabel('Number of successes')
    xlim(-0.5,max(x)+.5)
    ylim(0,max(y)*1.2)

    show(block=False)  # non-blocking so the frame loop below can run

    # Sweep the lambda index i up through u, then back down to 0, saving
    # one frame per step for later assembly into an animation.
    i, di = 0, 1
    i_max = len(u)
    j = 0
    while True:
        marker_fit_error.set_data([ u[i] ], [v[i]])
        z = poisson.pmf(x, u[i])*N
        for rect, h in zip(bar_ob, z):
            rect.set_height(h)
        fig.canvas.draw()
        savefig('frame%04d.tif'%j)
        j += 1
        i += di
        if i == i_max:
            # hit the right end: reverse direction (i becomes i_max - 2)
            di = -1
            i += 2*di
        elif i == 0:
            break

main()

Accidents in a factory

Another example, which seems like it should also be a Poisson distribution, was collected for the rates of accidents in an ammunition factory. (Not a place you want to be having accidents!) In this case, though, the best explanation leaves more than 30 percent of the probability mass unexplained, even at the best fit -- okay, but nowhere near as satisfying as the previous example. Instead, the authors originally proposed that the data are really obeying a negative-binomial distribution.

Number of munitions accidents in a WWI factory per person
Count Observed
0 447
1 132
2 42
3 21
4 3
5 2
Over 5 0
Sum 647

(Raw data file link, Bulmer's description of this data )

[Show code]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python

"""
This script creates an animation showing how
a distribution can be fit to a data.

First, we simulate or load a data set.  Then we
compare the data to binomial distributions with
a range of success probabilities.
"""

from scipy import floor, cumsum, array, linspace, cos, pi, rand, loadtxt
from scipy.stats import poisson
from scipy.optimize import fminbound
from matplotlib.pyplot import *
import time

def norm(x,e):
    # e-norm of the vector x, i.e. (sum over |x_i|**e) ** (1/e).
    return (sum(abs(x)**e))**(1./e)

def rndBinomial(p, n, replicas=1):
    """Sample from a binomial(n, p) distribution.

    Each of the n Bernoulli trials is floor(rand() + p), which equals 1
    with probability p.  Returns one int when replicas == 1, otherwise a
    list of `replicas` independent draws.
    """
    if replicas == 1:
        return int(sum(floor(rand(n) + p)))
    # range() instead of the Python-2-only xrange keeps this portable.
    return [rndBinomial(p, n) for _ in range(replicas)]

def main():
    """Fit a Poisson distribution to the munitions-accident data and animate.

    Loads (count, frequency) pairs, finds the lambda minimizing the relative
    L1 error between the empirical frequencies and the Poisson pmf, then
    sweeps lambda across (0, 3), writing one image file per step.
    """
    # load data
    data = loadtxt('munitions_factory.data',delimiter=',')
    # column 0: accident count, column 1: number of workers with that count
    x, y = map(array, zip(*data))
    N = sum(y)  # total number of observations

    Y = y*1./N  # empirical probability mass function
    # distribution fitting
    def fit_error(L_guess):
        # calculate the distance between our guess and the data
        z = poisson.pmf(x, L_guess)
        return norm(Y-z, 1)/norm(Y,1) # can be any p-norm
    # bounded scalar minimization over lambda in (0, 3)
    L_best = fminbound(fit_error, 1e-5, 3-1e-5)
    err_best = fit_error(L_best)
    # NOTE: Python 2 print statement; script is not Python-3 compatible as-is.
    print "Best fit: L = %f,  error = %f"%(L_best, fit_error(L_best))

    # generate data for animation
    F = [ (p, fit_error(p)) for p in linspace(1e-4,3-1e-4, 137) ]
    u, v = map(array, zip(*F))

    # draw basic figure
    fig = figure()

    # lower panel: fit error versus lambda
    subplot(2,1,2)
    plot(u, v, 'b-')
    #plot( L_best, fit_error(L_best), 'ko')
    # keep the Line2D marker so the animation loop can move it
    marker_fit_error = plot( u[2], v[2], 'ro').pop()
    ylim(0,1)
    xlabel('Poisson intensity $\lambda$')

    # upper panel: observed counts versus best-fit Poisson prediction
    subplot(2,1,1)
    plot(x, y, 'ko')
    z = poisson.pmf(x, L_best)
    width = .3
    xw = x-width*.5  # center bars on the integer counts
    bar_ob = bar(xw, z*N, width, color='r')
    title('N = %d observations, $\lambda_{best} = %.3f$, $Err_{best} = %1.2g$'%(N, L_best, err_best))
    ylabel('Number of times observed')
    xlabel('Number of successes')
    xlim(-0.5,max(x)+.5)
    ylim(0,max(y)*1.2)

    show(block=False)  # non-blocking so the frame loop can run

    # Sweep index i forward through u then back to 0, saving a frame each step.
    i, di = 0, 1
    i_max = len(u)
    j = 0
    while True:
        marker_fit_error.set_data([ u[i] ], [v[i]])
        z = poisson.pmf(x, u[i])*N
        for rect, h in zip(bar_ob, z):
            rect.set_height(h)
        fig.canvas.draw()
        savefig('frame%04d.tif'%j)
        j += 1
        i += di
        if i == i_max:
            # right end reached: reverse direction (i becomes i_max - 2)
            di = -1
            i += 2*di
        elif i == 0:
            break

main()

Bolide impacts

Now, let's look at our data on bolides in the same way.

[Show code]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python2


# dates of bolide observations from NASA between 1994 and 2013
# http://en.es-static.us/upl/2014/11/bolide_events.jpg
# http://neo.jpl.nasa.gov/fireball/
#
# These are "large".  Every day about 100 tons (90,000 kgs) of meteoroids --
# fragments of dust and gravel and sometimes even big rocks - enter the Earth's
# atmosphere.  Earth has a mass of about 5.972 x 10^24 kg.

from matplotlib.pyplot import *
from scipy import linspace, linalg, array, log, exp, loadtxt
import time

def loaddatedata():
    """Load bolide observation timestamps; return sorted times in days.

    Reads 'data/meteors.txt', where column 0 is a date ('%Y-%b-%d') and
    column 1 a time of day ('%H:%M:%S'), both converted to seconds via
    the local-time epoch.
    """
    # translate date into seconds
    f1 = lambda s : int(time.mktime(time.strptime(s, '%Y-%b-%d')))
    # translate time-of-day into seconds
    # NOTE(review): mktime on a time-only string carries a constant offset
    # from strptime's default date; it cancels in the min-subtraction
    # below -- confirm.
    f2 = lambda s : int(time.mktime(time.strptime(s, '%H:%M:%S')))

    x = loadtxt('data/meteors.txt', converters={0:f1, 1:f2 })

    # normalize, convert from seconds to days
    y = x[:,0] + x[:,1]
    y = (y - min(y))/(60*60*24.)
    y.sort()
    return y

def bin_data(y, n):
    """Histogram the event times y into n equal-width bins.

    Returns (dt, x, counts): dt is the bin width in days, x the bin
    indices 0..n-1, counts the number of events in each bin.  An event
    falling exactly at max(y) maps to bin n and is dropped, as before.
    """
    from collections import Counter  # local import keeps script deps unchanged
    tmin, tmax = min(y), max(y)
    # print() with a single argument behaves identically on Python 2 and 3;
    # the original 'print expr' statement was Python-2-only.
    print("Rough parameter estimate $\lambda=%f$"%((len(y)-1)/(tmax-tmin)))
    dt = (tmax-tmin)/float(n)
    print("# bin size = %f days"%dt)
    def g(t):
        # map a time to its 0-based bin index
        return int( (t-tmin)/dt )
    # Counter does the tally in one O(len(y)) pass, replacing the original
    # O(n*len(y)) h.count(i) loop; Counter[i] is 0 for empty bins.
    counts = Counter(g(t) for t in y)
    x, y = zip(*[(i, counts[i]) for i in range(n)])
    return dt, x, y

def diff(x):
    # Successive differences x[i] - x[i-1], returned as floats.
    return [float(b - a) for a, b in zip(x, x[1:])]

def main():
    """Plot bolide observation dates and a histogram of per-bin counts."""
    data = loaddatedata()
    # split the observation period into 80 equal-width time bins
    dt, u, v = bin_data(data, 80)
    # distribution of per-bin counts: how many bins saw 0, 1, 2, ... events
    x, y = zip(*[ (i, v.count(i)) for i in range(min(v), max(v)+1)])

    figure(1, figsize=(9,6))

    # inter-event waiting times; computed and sorted but not plotted below
    z = diff(data)
    z.sort()
    subplot(1, 2, 1)
    data = array(data) - min(data)  # days since the first observation
    plot(data,'go')
    xlabel('Rank')
    ylabel('Day from start')
    title('Dates of bolide sitings')

    subplot(1, 2, 2)
    bar(x,y,color='b')
    ylim(0, 1.1*max(y))
    title('%d day bins'%dt)
    ylabel('Frequency')
    xlabel('Count')

    savefig('figures/meteorDataAndFit.pdf',bbox_inches='tight')
    savefig('figures/meteorDataAndFit.png',bbox_inches='tight')
    #show()
main()

The choice of bins is a bit arbitrary, which should make us uncomfortable, but if we just go with it, we can now look for the value of \(\lambda\) that leads to the closest-matching Poisson distribution.

[Show code]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python

"""
This script creates an animation showing how
a distribution can be fit to a data.

First, we simulate or load a data set.  Then we
compare the data to distributions over
a parameter range, and find the "best one"
"""

from scipy import floor, cumsum, array, linspace, cos, pi, rand, loadtxt
from scipy.stats import poisson
#from scipy.optimize  import minimize_scalar as fminbnd
from scipy.optimize import fminbound
from matplotlib.pyplot import *
import time
import matplotlib.animation as manimation

fminbnd = lambda a,b : fminbound(a,b[0],b[1])

def norm(x,e):
    # p-norm with exponent e: (sum of |x_i|**e) raised to 1/e.
    s = sum(abs(x)**e)
    return s**(1./e)

def samplefit(L_best, N, numsamples):
    """Yield the relative L1 fit error for `numsamples` synthetic data sets.

    Each sample draws N values from a Poisson(L_best) distribution and
    compares its histogram with the expected Poisson cell counts, giving a
    null distribution of fit errors to judge the observed error against.
    """
    r = poisson(L_best)  # frozen Poisson distribution to sample from
    M = 50  # histogram support is 0..M-1; assumes every draw falls below M
    ideal = poisson.pmf(range(M), L_best)*N  # expected count in each cell
    for t in range(numsamples):
        s = list(r.rvs(N))
        smple = array([ s.count(i) for i in range(M) ])
        # fires if any draw >= M, i.e. the histogram missed a sample
        assert sum(smple) == N
        yield norm(ideal-smple, 1)/norm(ideal,1)
    return

def main():
    """Fit a Poisson distribution to binned bolide counts and plot the fit.

    The data are hard-coded (count, frequency) pairs from the binning step;
    Dt rescales the per-bin rate to a per-day rate for display.
    """
    # load data
    data=[(0,3),(1,9),(2,13),(3,16),(4,12),(5,14),(6,7),(7,4),(8,2),(9,0),(10,0)]
    # NOTE(review): Dt is presumably the bin width in days used to produce
    # the counts above -- confirm against the binning script.
    Dt = 45.82
    x, y = map(array, zip(*data))

    N = sum(y)  # total number of bins observed

    Y = y*1./N  # empirical probability mass function
    # distribution fitting
    def fit_error(L_guess):
        # calculate the distance between our guess and the data
        z = poisson.pmf(x, L_guess)
        return norm(Y-z, 1)/norm(Y,1) # can be any p-norm
    # bounded scalar minimization over the per-bin rate in (0, 10)
    L_best = fminbound(fit_error, 1e-5, 10-1e-5)
    err_best = fit_error(L_best)
    # NOTE: Python 2 print statement; script is not Python-3 compatible as-is.
    print "Best fit: L = %f,  error = %f"%(L_best, fit_error(L_best))

    # generate data for animation
    # x-axis rescaled to a per-day rate via p/Dt
    u, v = map(array, zip(*[
        (p/Dt,fit_error(p)) for p in linspace(1e-4,10-1e-4,137) ]))

    # lower panel: fit error as a function of the per-day rate
    subplot(2, 1, 2)
    plot(u, v, 'b-')
    #plot( L_best, fit_error(L_best), 'ko')
    # keep the Line2D marker (could be animated along the curve)
    marker_fit_error = plot( u[2], v[2], 'ro').pop()
    ylim(0, 1.5)
    xlabel('Poisson intensity $\lambda$')

    # upper panel: observed bin counts versus best-fit Poisson prediction
    subplot(2, 1, 1)
    plot(x, y, 'bo')
    z = poisson.pmf(x, L_best)
    width = 0.3
    bar_ob = bar(x - 0.5*width, z*N, width, color='r')
    title('N = %d observations, $\lambda_{best} = %.3f$, $Err_{best} = %1.2g$'%(N, L_best/Dt, err_best))
    ylabel('Number of times observed')
    xlabel('Number of successes')
    xlim(-0.5, max(x)+.5)
    ylim(0, max(y)*1.2)

    savefig('meteorbinfit.pdf',bbox_inches='tight')
    savefig('meteorbinfit.png',bbox_inches='tight')

    figure(2)
    # null distribution of fit errors from synthetic Poisson data sets
    subplot(2,1,1)
    sampleerr = [ e for e in samplefit(L_best, N, 200) ]
    sampleerr.sort()
    plot(sampleerr,'kx-')
    # cumulative ideal vs. sampled counts (Q-Q-style comparison)
    subplot(2,1,2)
    M = 16
    ideal = cumsum(poisson.pmf(range(M), L_best)*N)
    r = poisson(L_best)
    for t in range(30):
        s = list(r.rvs(N))
        smple = cumsum(array([ s.count(i) for i in range(M) ]))
        plot(ideal, smple,'ko-')
    plot(cumsum(z*N), cumsum(y),'ro-')
    show()

    return

main()

Poisson processes in two dimensions

Calculate the distances between all pairs of points, and rank these distances from smallest to largest.

[Show code]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/python2

from scipy import rand, zeros, sqrt, pi, array, arcsin, sign, linspace, real
from scipy import randn
from matplotlib.pyplot import *

def hvy(x):
    """Heaviside step: 0 for x < 0, 0.5 at x == 0, 1 for x > 0 (elementwise)."""
    return real(0.5*(sign(x) + 1.))

def ripleyK(r):
    # Calculate Ripley's K statistic for a 1 x 1 torus
    #
    # For each distance r, returns the area covered by a disc of radius r
    # on the unit torus, so the curve saturates at 1.  The hvy() factors
    # act as indicator functions selecting one of three regimes of r.
    #
    # NOTE(review): relies on scipy's top-level sqrt (the scimath variant,
    # which returns complex values for negative arguments) so that real()
    # zeroes the out-of-regime contributions; plain numpy.sqrt would
    # produce NaNs here -- confirm which sqrt is in scope.
    S = sqrt(0.5)  # half the torus diagonal: the largest possible distance
    # p, q: correction terms for discs that wrap around the torus
    p = real(sqrt(4*r*r-1))
    q = real(arcsin(real(1-p)/(1e-19+r*sqrt(8))))  # 1e-19 guards r == 0
    # regime 1: 0 <= r <= 1/2, plain disc area pi r^2
    y = hvy(r)*hvy(0.5-r)*pi*r*r
    #print hvy(0.5-r)*hvy(S-r)
    # regime 2: 1/2 < r <= sqrt(1/2), disc overlaps itself across the torus
    y += hvy(r-0.5)*hvy(S-r)*(p+4*r*r*q)
    # regime 3: r > sqrt(1/2), the disc covers the whole torus (area 1)
    y += hvy(r-S)
    return y

def calcpointdistances(x,y):
    """Yield the wraparound (torus) distance for every unordered pair of points."""
    def wrap(d):
        # shortest displacement on the unit circle: the nearest of d, d-1, d+1
        return min([abs(d - 1), abs(d), abs(d + 1)])
    n = len(x)
    assert len(y) == n
    for j in range(n):
        for k in range(j):
            dx = wrap(x[j] - x[k])
            dy = wrap(y[j] - y[k])
            yield sqrt(dx*dx + dy*dy)

def getnormaldata(numpts):
    # Clustered (non-Poisson) test points: a Gaussian blob centred at
    # (0.5, 0.5), plus the sorted pairwise torus distances.
    x = randn(numpts)/5+0.5
    y = randn(numpts)/5+0.5
    dists = array(sorted(calcpointdistances(x, y)))
    return x, y, dists

def getpoissondata(numpts):
    # Uniform (Poisson-process) test points on the unit square, plus the
    # sorted pairwise torus distances between them.
    x = rand(numpts)
    y = rand(numpts)
    dists = array(sorted(calcpointdistances(x, y)))
    return x, y, dists

def main():
    """Test spatial uniformity of random points via a Ripley-K comparison."""
    n = 83
    #x, y, distances = getnormaldata(n)
    x, y, distances = getpoissondata(n)

    # empirical CDF levels for the n*(n-1)/2 sorted pairwise distances
    baseline = linspace(0,1,len(distances))
    # predicted cumulative probability at each observed distance
    Area = ripleyK(distances)
    Err = Area - baseline
    toterr = sum(abs(Err))/sum(baseline)  # relative L1 discrepancy

    figure(1)
    plot(x, y, 'o')
    xlim(0,1)
    ylim(0,1)
    title('%d Poisson points in a plane'%n)
    savefig('ripley-planepts.pdf')
    savefig('ripley-planepts.png')

    figure(2)
    # top: predicted (ripleyK) vs. observed (uniform baseline) cumulative curves
    subplot(2,1,1)
    plot(distances, Area, 'r-', linewidth=2)
    plot(distances, baseline, 'b' )
    legend(['Predicted', 'Observed'])
    title('Prediction and observation for Ripley K statistic')
    ylabel('Cumulative Probability')

    # bottom: residual between the two curves
    subplot(2,1,2)
    plot(baseline, Err, 'g-' )
    title('%d Points, Relative Error = %.3f'%(n,toterr))
    ylabel('Residual vector over distance')
    xlabel('Distance from origin')

    savefig('ripley-test.pdf')
    savefig('ripley-test.png')

main()

There are more data sets like these that you can explore out on the internet.