Lecture 16 - Data and Lines

(previous, next)



Exploration -- Looking for patterns in data

Estimates of meteor-rate arrival:

But our rough estimate doesn't really use the data(!) and binning is rather arbitrary and doesn't seem to make the best use of all the data we have. Is there a better way?

Yes, we can exploit the expected cumulative distribution for our increments ...

(data file)

[Show code]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python2

"""
Fitting the increments between meteor observations
to an exponential distribution, and then estimating
the parameter of the exponential using least-squares.
"""

from matplotlib.pyplot import *
from matplotlib import rcParams
from scipy import linspace, linalg, array, log, exp, loadtxt
from scipy.optimize import fminbound
from scipy.linalg import norm
import time

def loaddatedata():
    # translate date into seconds
    f1 = lambda s : int(time.mktime(time.strptime(s, '%Y-%b-%d')))
    # translate time-of-day into seconds
    f2 = lambda s : int(time.mktime(time.strptime(s, '%H:%M:%S')))

    x = loadtxt('meteordates.txt', converters={0:f1, 1:f2 })

    # normalize, convert from seconds to days
    y = x[:,0] + x[:,1]
    y = (y - min(y))/(60*60*24.)
    y.sort()
    return y

def diff(x):
    return [ float(x[i]-x[i-1]) for i in range(1,len(x)) ]

def main():
    data = loaddatedata()
    L_best_TP = len(data)/(data[-1]-data[0])

    z = diff(data)
    n = len(z)
    z.sort()
    print (min(z), max(z))
    z = array(z)
    rank = array(range(1,n+1))

    # Approximate by least-squares
    A = array([[i] for i in z])
    b = log(1-rank/float(n+1))
    s = linalg.solve( A.T.dot(A), A.T.dot(b) )
    L_best_LSQ = -s[0]
    print L_best_LSQ

    # for the sake of argument, let's fit the probability
    # plot using nonlinear minimization (SLOW!!)
    def pErr(L):
        return norm(1-exp(-L*z)-rank/float(n),2)
    L_best_PP = fminbound(pErr,0.01,0.1)


    print "Time interval: ", data[0], data[-1], len(data), L_best_TP
    print "Linear Least Squares: ", L_best_LSQ
    print "Probability plot comparison: ", L_best_PP

    figure(1, figsize=(14,12))
    rcParams.update({'font.size':18})

    subplot(2, 2, 1)
    data = array(data) - min(data)
    plot(data, 'go')
    xlabel('Rank')
    ylabel('Day from start')
    title('Dates of bolide sightings')
    title('Linear Least Squares to CDF')
    text(150, 500, '$\lambda = %.4f$'%L_best_TP)

    subplot(2, 2, 2)
    plot(z, rank*1./n, 'bx')
    title('Cumulative increments between points')
    ylabel('Increment size')
    xlabel('Rank')
    xlim(0,max(z))

    subplot(2,2,3)
    #plot(u, 1-exp(-L_best*z)-u,'ro')
    plot(rank/float(n), 1-exp(-L_best_PP*z),'bx', rank/n, rank/n, 'k-')
    text(0.6, 0.3, '$\lambda = %0.4f$'%L_best_PP)
    title('Best Fit Probability plot')
    xlabel('Predicted Probability')
    ylabel('Observed Probability')

    subplot(2, 2, 4)
    semilogy(z, 1-rank/float(n), 'x')
    semilogy(z, exp(A.dot(s)), 'r-') # least-squares line
    ylim(2e-3, 1.)
    xlim(0, 70)
    xlabel('Time T (days)')
    ylabel('Probability $P(t > T)$')
    text(10, 1.e-2, '$\lambda = %.4f$'%L_best_LSQ)

    savefig('meteorIncrements.pdf',bbox_inches='tight')
    savefig('meteorIncrements.png',bbox_inches='tight')

main()

Linear Least squares

Theorem: The closest solution to \(A x = b\) solves \(A^T A s = A^T b\). This is also the best estimate for the true value if errors are normally distributed.

\(A^T A s = A^T b\) is called the "normal equation" of a linear system.

First, good to have a reference on Matrix calculus.

\[\begin{gather} E(x) := ||Ax-b|| _ 2 = (Ax - b)^T (Ax-b) = x^T A^T A x - x^T A b - b^T A x + b^T b \\ E(x) = x^T A^T A x - 2 b^T A x + b^T b \\ \frac{dE}{dx} = 2 x^T A^T A - 2 b^T A = 0 \\ A^T A x = A^T b. \end{gather}\]