Homework 01: Numerical python and data handling#

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
import os.path
import subprocess
def wget_data(url):
    """Download ``url`` into the local ``./tmp_data`` directory using wget.

    Parameters
    ----------
    url : str
        Address of the file to download.
    """
    download_dir = './tmp_data'
    # -nc: do not re-download a file that is already present;
    # -P:  directory in which wget saves the file.
    command = ["wget", "-nc", "-P", download_dir, url]
    subprocess.run(command)
def locate_data(name, check_exists=True):
    """Return the path of a data file inside the local ``./tmp_data`` directory.

    Parameters
    ----------
    name : str
        File name, relative to ``./tmp_data``.
    check_exists : bool
        When True (the default), verify that the file exists before
        returning its path.

    Returns
    -------
    str
        Path to the requested file.

    Raises
    ------
    RuntimeError
        If ``check_exists`` is True and no file exists at the path.
    """
    local_path='./tmp_data'
    path = os.path.join(local_path, name)
    if check_exists and not os.path.exists(path):
        # Previously misspelled as ``RuxntimeError``, which surfaced as a
        # NameError instead of the intended, descriptive RuntimeError.
        raise RuntimeError('No such data file: {}'.format(path))
    return path

Problem 1#

Part a:

Use matrix multiplication in numpy to multiply x times y and y times x (recalling that matrix multiplication is not commutative).

# A correct solution should pass these tests.
x = np.arange(4).reshape(4,1)
y = np.arange(4).reshape(1,4)

# x times y: (4, 1) times (1, 4) gives the 4x4 outer product.
xy = np.matmul(x, y)

assert np.array_equal(
    xy,
    [[0, 0, 0, 0],
     [0, 1, 2, 3],
     [0, 2, 4, 6],
     [0, 3, 6, 9]])

# y times x: (1, 4) times (4, 1) gives a 1x1 matrix holding the inner product.
yx = np.matmul(y, x)

assert np.array_equal( yx, [[14]])

Part b

Calculate the cross product of vector \(\vec{x}\) with vector \(\vec{y}\). Show, using the dot product, that the resulting vector is perpendicular to both vectors. Let \(\vec{x}\) = [3, 5, 3] and \(\vec{y}\) = [4, 7, 9].

Problem 2#

Use np.histogram to calculate the fraction of values in an arbitrary input data array that lie in each of the 10 intervals [0.0, 0.1), [0.1, 0.2), …, [0.9, 1.0). You can assume that all input values are in the range [0,1). This is a useful technique to estimate the probability density that the data was sampled from.

def estimate_probability_density(data, bins):
    """Estimate the probability density of arbitrary data.

    Parameters
    ----------
    data : array
        1D numpy array of random values.
    bins : array
        1D numpy array of N+1 bin edges to use. Must be increasing.

    Returns
    -------
    array
        1D numpy array of N probability densities.
    """
    assert np.all(np.diff(bins) > 0)

    # density=True divides each bin count by (number of binned values * bin
    # width), so the densities times the bin widths sum to one. Values that
    # fall outside the bin range are not counted.
    rho, _ = np.histogram(data, bins=bins, density=True)
    return rho
# A correct solution should pass these tests.
# A fixed seed makes the sample, and hence the expected densities, reproducible.
generator = np.random.RandomState(seed=123)
data = generator.uniform(size=100)
# 11 edges define 10 equal-width bins spanning 0 to 1.
bins = np.linspace(0., 1., 11)
rho = estimate_probability_density(data, bins)
# Normalization: the densities times the common bin width (0.1) must sum to one.
assert np.allclose(0.1 * rho.sum(), 1.)
# Reference densities for the seeded sample above.
assert np.allclose(rho, [ 0.6,  0.8,  0.7,  1.7,  1.1,  1.3,  1.6,  0.9,  0.8,  0.5])

Problem 3#

Define a function to calculate the entropy \(H(\rho)\) of a binned probability density, defined as: \[ H(\rho) \equiv -\sum_i \rho_i \log(\rho_i) \Delta w_i \; , \] where \(\rho_i\) is the binned density in bin \(i\) with width \(\Delta w_i\).

def binned_entropy(rho, bins):
    """Calculate the binned entropy.

    Evaluates ``-sum_i rho_i * log(rho_i) * dw_i`` where ``dw_i`` is the
    width of bin ``i``, using the natural logarithm.

    Parameters
    ----------
    rho : array
        1D numpy array of densities, e.g., calculated by the previous function.
    bins : array
        1D numpy array of N+1 bin edges to use. Must be increasing.

    Returns
    -------
    float
        Value of the binned entropy.
    """
    assert np.all(np.diff(bins) > 0)

    rho = np.asarray(rho, dtype=float)
    widths = np.diff(bins)
    # Empty bins contribute nothing (rho * log(rho) -> 0 as rho -> 0), but
    # evaluating log(0) directly would turn the whole sum into nan.
    occupied = rho > 0
    return -np.sum(rho[occupied] * np.log(rho[occupied]) * widths[occupied])
# A correct solution should pass these tests.
# A fixed seed makes both samples, and hence the expected entropies, reproducible.
generator = np.random.RandomState(seed=123)
# Two samples of 10000 values: uniform draws, and uniform draws raised to the
# fourth power.
data1 = generator.uniform(size=10000)
data2 = generator.uniform(size=10000) ** 4
# 11 edges define 10 equal-width bins spanning 0 to 1.
bins = np.linspace(0., 1., 11)
rho1 = estimate_probability_density(data1, bins)
rho2 = estimate_probability_density(data2, bins)
H1 = binned_entropy(rho1, bins)
H2 = binned_entropy(rho2, bins)
# Reference entropies for the two seeded samples above.
assert np.allclose(H1, -0.000801544)
assert np.allclose(H2, -0.699349908)

Problem 4#

Define a function that reads pong_data.hf5 and returns a new subset DataFrame containing only the columns x5, y5, x7, y7 (in that order) and only the last 200 rows.

# Download the Problem 4 data file into ./tmp_data.
wget_data('https://courses.physics.illinois.edu/phys398dap/fa2023/data/pong_data.hf5')
def create_subset():
    """Read pong_data.hf5 and return a subset.

    Returns
    -------
    pandas.DataFrame
        The columns x5, y5, x7, y7 (in that order), restricted to the last
        200 rows of the file's table.
    """
    # NOTE(review): assumes the file stores a single DataFrame, so read_hdf
    # needs no explicit key -- confirm against the data file.
    pong_data = pd.read_hdf(locate_data('pong_data.hf5'))
    # Select the columns in the required order, then keep the final 200 rows.
    return pong_data[['x5', 'y5', 'x7', 'y7']].iloc[-200:]
# A correct solution should pass these tests.
subset = create_subset()
# array_equal compares element by element, so the column order is checked too.
assert np.array_equal(subset.columns.values, ('x5', 'y5', 'x7', 'y7'))
assert len(subset) == 200
# Only the 'mean' row of the describe() summary is compared with reference values.
summary = subset.describe()
assert np.allclose(summary.loc['mean', :].values,
                   [ 0.43564752,  0.30610958,  0.57520991,  0.21383226])

Problem 5#

Part a

Write a function which uses np.random to make a data frame with N rows and two columns, “index” which is just the row number and “y” which is a random value. Find all ordered pairs of y (i.e. 3,4 and 4,3 would be distinct pairs) and put them in a list. Do this two ways:

Part b

Run the function for N = 50. Show that the lists for the two methods are the same length and contain the same pairs.

Part c

Use the time package to measure the time for each method of generating permutations for N = 2, 5, 10, 100, 500, 1000, 5000.

Part d

Make a plot showing the run time (on the y-axis) for each method as a function of N (on the x-axis). Make sure to label the plot. What do you see?