""" Possible solutions to the first set of exercises from Mike Irwin. These only give a possible solution. I tried to vary the methods for calculations a bit to show different approaches. These are not necessarily the most efficient or most flexible solutions. You should be able to run this script in a terminal via: $:> python exercises.py """ # Import necessary modules: first the system modules import itertools import urllib import os # Then 3rd party modules import numpy as np from numpy import exp, sqrt, pi import matplotlib.pyplot as plt from scipy.misc import factorial # Then your own modules (but we don't have any) def download_data(): """ Download the data and the PDF with the exercises to the current folder. The download will only occur if a file does not exists, otherwise it will be skipped. A message is print to the screen. """ # Define the locations of the files uri1 = 'http://apm3.ast.cam.ac.uk/~mike/astrostats/cetus_fors.dat' uri2 = 'http://apm3.ast.cam.ac.uk/~mike/astrostats/scl_flames.dat' uri3 = 'http://apm3.ast.cam.ac.uk/~mike/astrostats/astrostats_E12.pdf' # Start an URLopener to download the files url = urllib.URLopener() # Cycle through the file locations and check if that file already exists # in the current working directory. If not, download; else, don't bother. for uri in [uri1, uri2, uri3]: local_file = os.path.basename(uri) if not os.path.isfile(local_file): print("Downloading {} to {}".format(local_file, os.getcwd())) filen, msg = url.retrieve(uri, filename=local_file) else: print("File {} already exists in {}".format(local_file, os.getcwd())) # Be sure to close the stream! url.close() def gambler(a=1, b=3, c_d=20): """ Solve the gambler's problem. """ c = np.arange(c_d+1) d = c_d - c # Simple - but wrong - approach P_c_simple = factorial(c_d) / (factorial(c) * factorial(d)) *\ a**c * b**d / (a+b)**c_d # Bayesian approach num = factorial(a+c) * factorial(b+d) * factorial(c+d) * factorial(a+b+1) denom = factorial(a) * factorial(b) * factorial(c) * factorial(d) * \ factorial(a+b+c+d+1) P_c_bayes = num / denom # For Fisher's approach, we need to deal with the large powers. Therefor, # we take the log of the probability function. At c==0 and c==c_d, the # probability drops to zero, giving us division and invalid errors from # numpy. We could treat these cases separately, avoiding zero-divisions # and such, but we choose to just ignore the warnings raised by numpy: previous_settings = np.seterr(divide='ignore', invalid='ignore') num = (a+c)*np.log(a+c) + (b+d)*np.log(b+d) + (c+d)*np.log(c+d) +\ (a+b)*np.log(a+b) denom = a*np.log(a) + b*np.log(b) + c*np.log(c) + d*np.log(d) +\ (a+b+c+d)*np.log(a+b+c+d) P_c_fisher = np.exp(num - denom) P_c_fisher[0] = 0. P_c_fisher[-1] = 0. P_c_fisher = P_c_fisher / np.sum(P_c_fisher) # But restore the error-reporting settings from numpy to their previous # values, perhaps this is necessary for other bits of the code np.seterr(**previous_settings) # Make a plot of the probabilities plt.figure() plt.title("Binomial prediction a={} b={} c+d={}".format(a, b, c_d)) plt.step(c, P_c_simple, where='mid', color='b', label='Simple') plt.step(c, P_c_bayes, where='mid', color='g', label='Bayes') plt.step(c, P_c_fisher, where='mid', color='r', label='Fisher') plt.legend() plt.xlabel("c") plt.ylabel("P(c)") plt.savefig('gambler') print("Gambler's problem: done") def velocity_dispersions(scale_error=1.0): """ Analyze the velocity dispersions in star clusters/dwarf galaxies. """ # Read in the data and extract the velocity and sigma columns data = np.loadtxt('cetus_fors.dat', skiprows=2) v_hel = data[:,3] verr = data[:,4] * scale_error # Construct a grid in esimated mean velocity and error vmean_1D = np.linspace(-90, -40, 120) sigma_1D = np.linspace(0, 35, 100) vmean, sigma = np.meshgrid(vmean_1D, sigma_1D) # Compute the log-likelihood sigma_total2 = verr[:,None,None]**2 + sigma[None,:,:]**2 term1 = np.log(sigma_total2) term2 = 0.5 * (v_hel[:,None,None] - vmean[None,:,:])**2 / sigma_total2 lnL = -0.5 * np.sum(term1, axis=0) - np.sum(term2, axis=0) # Make a figure of the log-likelihood map and plot some contours # First we want to know the best values lnL_max = lnL.max() lnL_argmax = lnL.argmax() v_best, sig_best = np.ravel(vmean)[lnL_argmax], np.ravel(sigma)[lnL_argmax] plt.figure(figsize=(10, 6)) plt.subplots_adjust(wspace=0.4) # Plot the map plt.subplot(121) plt.title("2D ML distribution") plt.imshow(lnL, extent=[vmean.min(), vmean.max(), sigma.min(), sigma.max()], origin='image', aspect='auto', cmap=plt.cm.spectral, vmin=lnL_max-50, vmax=lnL_max) # Give some info on the best values text = r"$\hat{{v}}={:.2f}$ km s$^{{-1}}$"+'\n'+"$\hat{{\sigma}}={:.2f}$ km s$^{{-1}}$" text = text.format(v_best, sig_best) plt.annotate(text, (0.9,0.9), xycoords='axes fraction', ha='right', va='top', fontsize='x-large') # Set the colorbar and give it a label cbar = plt.colorbar() cbar.set_label(r'$\ln L$', fontsize='x-large') # Plot the contours levels = [lnL_max, lnL_max-0.5, lnL_max-2, lnL_max-4.5] plt.contour(vmean, sigma, lnL, levels=levels, colors='k') # Set some labels plt.xlabel(r'$v_\mathrm{hel}$ km s$^{-1}$', fontsize='x-large') plt.ylabel(r'$\sigma_v$ km s$^{-1}$', fontsize='x-large') # Plot the marginalised distribution plt.subplot(122) plt.title("Marginalised distribution") lnL = lnL - lnL_max marginalised = np.exp(lnL).sum(axis=1) marginalised = marginalised / marginalised.max() plt.plot(sigma_1D, marginalised, 'k-') plt.gca().set_yscale('log') plt.ylim(2e-4, 1) plt.xlabel(r'$\sigma_v$ km s$^{-1}$', fontsize='x-large') plt.ylabel("Relative likelihood") plt.savefig("velocity_dispersion_{:.0f}".format(scale_error*100)) print("Velocity dispersion: done") # Return the best values, perhaps they come in handy at some point return v_best, sig_best def template_spectrum(wave, sigma, velo, locs=(8498.02, 8542.09, 8662.14), amps=(150, 250, 200.)): """ Build a template spectrum using a given wavelength range. sigma denotes the width of the specral lines. velo denotes the velocity """ # Shift the location according to the velocity cc = 299792.458 locs = np.array(locs) * (1.0 + velo/cc) # Make the template spectrum as a sum of Gaussians line = 0.0 for loc, amp in zip(locs, amps): line += -amp * np.exp( - 0.5*(wave-loc)**2 / (sigma**2)) return line def cross_correlation(): """ Compute the cross correlation. """ # Read the data, extract the columns, and renormalize the flux data = np.loadtxt('scl_flames.dat', skiprows=2) wave, flux, error, continuum, template = data.T flux = flux / continuum -1 # Define the grid in sigma and velocity sigma = np.linspace(0.25,2.25,100) velo = np.linspace(-500, 500, 100) # Prepare an array for the values of the cross-correlation profile phis = np.zeros( len(sigma)*len(velo) ) # Run over each combination of sigma and velocity in the grid for i, (sig, vel) in enumerate(itertools.product(sigma, velo)): # Get the template spectrum and normalise it T = template_spectrum(wave, sig, vel) norm = np.sqrt(np.sum(flux**2) * np.sum(T**2)) # Then get the value of the cross correlation profile phis[i] = np.sum(flux * T) / norm # Reshape the results in a nice grid phis = phis.reshape((100,100)) # Make a grid of the CCP, but summed over all sigmas plt.figure(figsize=(10,6)) plt.plot(velo, phis.mean(axis=0)) plt.axhline(0, linestyle='--', color='k') plt.xlim(-500,500) plt.xlabel(r"Velocity [km s$^{-1}$]") plt.ylabel("Cross-correlation") plt.grid() plt.savefig('cross_correlation_01') # Now try the same using the likelihood function # We'll take a slightly different grid sigma = np.linspace(0.25,2.25,100) velo = np.linspace(85, 115, 100) # But effectively do the same, i.e. compute the likelihood for every point # in our grid lnL = np.zeros( len(sigma)*len(velo) ) for i, (sig, vel) in enumerate(itertools.product(sigma, velo)): T = template_spectrum(wave, sig, vel) k_numerator = flux * T / error**2 k_denominator = T**2 / error**2 k = np.sum(k_numerator) / np.sum(k_denominator) lnL[i] = - np.sum( (flux - k*T)**2 / (2.0*error**2)) # Renormalize the loglikelihood lnL = lnL - lnL.max() # And make a figure plt.figure() lnL = lnL.reshape((100,100)) plt.imshow(lnL, extent=[velo[0],velo[-1], sigma[0],sigma[-1]], origin='image', aspect='auto', cmap=plt.cm.spectral, vmin=-0.003) plt.xlabel(r"$v_\mathrm{hel}$ km s$^{-1}$", fontsize='x-large') plt.ylabel(r"$\sigma_T$ [$\AA$]", fontsize='x-large') cbar = plt.colorbar() cbar.set_label("log Likelihood", fontsize='x-large') velo, sigma = np.meshgrid(velo, sigma) plt.contour(velo, sigma, lnL, colors='k', levels=np.linspace(-5e-4,0,10), linestyles='solid') plt.savefig('cross_correlation_02') if __name__ == "__main__": download_data() gambler() velocity_dispersions(scale_error=1.0) velocity_dispersions(scale_error=2.0) cross_correlation() plt.show()