import sys
sys.path.insert(0, '../')

from agasc_gaia import cross_match as xm
from agasc_gaia import datasets
import scipy.interpolate
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# a quick verification of the above, using example values and a simple Riemann sum.
sigma = 0.1
s = np.linspace(-3, 3, 12001)
ds = np.diff(s)
s = s[:-1] + ds/2
p = 1/(np.sqrt(2*np.pi)*sigma) * np.exp(-s**2/(2*sigma**2))
integral = np.sum(p * ds)
print(f"{integral=}, max_p={np.max(p):.3f}")

integral=1.0, max_p=3.989

values = np.random.choice(s, p=p * ds, size=10000)
p_values = 1 - 0.5*(scipy.special.erf(np.abs(values)/sigma/np.sqrt(2)) - scipy.special.erf(-np.abs(values)/sigma/np.sqrt(2)))

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
plt.sca(axes[0])
_, bins, _ = plt.hist(
    values,
    bins=np.linspace(-3, 3, 601),
    histtype='step',
    density=True,
)
plt.plot(s, p)
plt.xlim((-3*sigma, 3*sigma))
plt.sca(axes[1])
plt.hist(
    p_values,
    histtype='step',
);

prob = 1/(np.sqrt(2*np.pi)*sigma) * np.exp(-values**2/(2*sigma**2))

fig, axes = plt.subplots(1, 1, figsize=(6, 4))
plt.plot(prob, p_values, '.')
plt.xlabel('Probability')
plt.ylabel('p-value')

Text(0, 0.5, 'p-value')

# this is one way to draw a single value and calculate its p_value using the same example
indices = np.arange(s.size)
idx = np.random.choice(indices, p=p * ds)
s_0 = s[idx]
p_0 = p[idx]
p_value = np.sum(p[p < p_0] * ds[p < p_0])
p_value_2 = 1- 0.5*(scipy.special.erf(-s_0/sigma/np.sqrt(2)) - scipy.special.erf(s_0/sigma/np.sqrt(2)))
print(f"value: {s_0}")
print(f"{p_value=} (numerical)")
print(f"p_value={p_value} (analytical)")

value: -0.04125000000000001
p_value=0.6781415866630691 (numerical)
p_value=0.6781415866630691 (analytical)

def generate_sample(probability, range_1, range_2, n_sample=10000000, n_grid=100):
    """
    Generate a sample of n_sample values from the given distribution.
    
    This function is not great. The accuracy depends on having a small grid size, given by the
    number of points `n_grid`.
    """
    # we make a grid with sampling probabilities according to the given function
    d_mag = np.linspace(range_1[0], range_1[1], n_grid+1)
    d2d = np.linspace(range_2[0], range_2[1], n_grid+1)
    d_d_mag, d_d2d = np.diff(d_mag), np.diff(d2d)
    d_mag = (d_mag[1:] + d_mag[:-1]) / 2
    d2d = (d2d[1:] + d2d[:-1]) / 2
    d_mag, d2d = np.meshgrid(d_mag, d2d)
    d_d_mag, d_d2d = np.meshgrid(d_d_mag, d_d2d )
    d_area = d_d_mag * d_d2d


    p_match = probability(d_mag, d2d)
    norm_p_match = p_match / np.sum(p_match * d_area) 

    idx = np.random.choice(
        np.arange(len(norm_p_match.flatten())),
        size=n_sample,
        p=(norm_p_match / np.sum(norm_p_match)).flatten()
    )
    p_match_sample = p_match.flatten()[idx]
    return p_match_sample

def get_p_value_function_mc(sigma_1, sigma_2, n_grid=10000, n_sample=10000000):
    # generate a sample according to that distribution
    # this distribution has all weights equal to 1
    # as a result, the tails are not very well sampled.
    p_match_sample = generate_sample(
        lambda x, y: gaussian(x, y, sigma_1=sigma_1, sigma_2=sigma_2),
        (-20 * sigma_1, 20 * sigma_1),
        (-20 * sigma_2, 20 * sigma_2),
        n_sample=n_sample,
        n_grid=n_grid
    )

    # calculate the CDF
    eps = np.finfo(p_match_sample.dtype).eps
    bins = np.logspace(np.log(p_match_sample[p_match_sample>0].min()*(1-eps)), np.log(p_match_sample.max()*(1+eps)), 1000000)
    vals, bins = np.histogram(
        p_match_sample,
        bins=bins,
    )
    n = np.cumsum(vals)/np.sum(vals)
    # and make an interpolator to get the CDF value given the match probability
    x = (bins[1:] + bins[:-1]) / 2
    return scipy.interpolate.interp1d(
        x,
        n,
        fill_value=(np.min(n), np.max(n)),
        bounds_error=False
    )

def get_p_value_function_mc_weighted(sigma_1, sigma_2, n_grid=10000, n_sample=10000000, p_sample_min=1e-3):
    # we make a grid with sampling probabilities according to the given function
    n = n_grid
    m = 30 * sigma_1
    r = 30 * sigma_2
    d_mag, d2d = np.linspace(-m, m, 3*n+1), np.linspace(-r, r, n+1)
    d_d_mag, d_d2d = np.diff(d_mag), np.diff(d2d)
    d_mag = (d_mag[1:] + d_mag[:-1]) / 2
    d2d = (d2d[1:] + d2d[:-1]) / 2
    d_mag, d2d = np.meshgrid(d_mag, d2d)
    d_d_mag, d_d2d = np.meshgrid(d_d_mag, d_d2d )
    d_area = d_d_mag * d_d2d


    p_match = gaussian(d_mag, d2d, sigma_1=sigma_1, sigma_2=sigma_2)
    norm_p_match = p_match / np.sum(p_match * d_area)

    # now generate a sample according to that distribution
    # up-sample the tails, so the probability never goes below the given value
    p_sample_min = p_sample_min * norm_p_match.max()

    sample_prob = np.where(norm_p_match > p_sample_min, norm_p_match, p_sample_min)
    sample_weights = (norm_p_match / sample_prob)
    sample_prob = sample_prob / np.sum(sample_prob)
    idx = np.random.choice(
        np.arange(len(sample_prob.flatten())),
        size=n_sample,
        p=(sample_prob).flatten()
    )
    p_match_sample = p_match.flatten()[idx]
    p_match_sample_weights = sample_weights.flatten()[idx]
    # calculate the CDF
    eps = np.finfo(p_match_sample.dtype).eps
    # bins = np.linspace(p_match_sample.min()*(1-eps), p_match_sample.max()*(1+eps), 1000000)
    bins = np.logspace(np.log(p_match_sample[p_match_sample>0].min()*(1-eps)), np.log(p_match_sample.max()*(1+eps)), 1000000)
    vals, bins = np.histogram(
        p_match_sample,
        bins=bins,
        weights=p_match_sample_weights,
    )
    n = np.cumsum(vals)/np.sum(vals)
    # and make an interpolator to get the CDF value given the match probability
    x = (bins[1:] + bins[:-1]) / 2
    return scipy.interpolate.interp1d(
        x,
        n,
        fill_value=(np.min(n), np.max(n)),
        bounds_error=False
    )

# the 2d gaussian case is easy because we can integrate it,
# and the p-value is the integral from r to infinity, where
# r**2 = (x/sigma_1)**2 + (y/sigma_2)**2
# so p_value = np.exp(-0.5 * ((x/sigma)**2 + (y/sigma)**2))
# which happens to be the same function
# In other words: the function p_value = f(p_match) is the identity in the 2d-gaussian case
def gaussian(x, y, sigma_1=1, sigma_2=2):
    return np.exp(-0.5 * ((x/sigma_1)**2 + (y/sigma_2)**2))

sigma = 1

f = get_p_value_function_mc(sigma, sigma, n_grid=10000, n_sample=100000)
f2 = get_p_value_function_mc(sigma, sigma, n_grid=10000, n_sample=1000000)
f3 = get_p_value_function_mc_weighted(sigma, sigma, n_grid=1000, n_sample=10000)

x = np.linspace(0, 10, 50)
y = x
p_0 = gaussian(x, y, sigma_1=sigma, sigma_2=sigma)


fig, axes = plt.subplots(1, 2, figsize=(12, 4))
plt.sca(axes[0])
plt.plot(p_0, p_0, '-', label='identity')
plt.plot(p_0, f3(p_0), '.', label='weight (100000)')
plt.plot(p_0, f2(p_0), '.', label='no weight, 1000000')
plt.plot(p_0, f(p_0), '.', label='no weight, 100000')

plt.xscale('log')
plt.yscale('log')
plt.title(r'p-value function ($p_{value} = f(p_{match})$)')
plt.xlabel(r'$p_{match}$')
plt.ylabel(r'$p_{value}$')
plt.legend()

plt.sca(axes[1])
plt.plot(x, p_0, label='analytical')
plt.plot(x, f3(p_0), '.', label='weight (100000)')
plt.plot(x, f2(p_0), '.', label='no weight, 1000000')
plt.plot(x, f(p_0), '.', label='no weight, 100000')
plt.yscale('log')

plt.title(r'p-value Vs displacement')
plt.xlabel(r'$\Delta x$')
plt.ylabel(r'$p_{value}$')

plt.legend()
# plt.ylim((1e-6, 2))

<matplotlib.legend.Legend at 0x13609fd90>

sample = generate_sample(
    lambda x, y: gaussian(x, y, sigma_1=sigma, sigma_2=sigma),
    (-5 * sigma, 5 * sigma),
    (-5 * sigma, 5 * sigma),
    n_sample=1000000,
    n_grid=1000,
)

vals, bins, _ = plt.hist(
    sample,
    histtype='step',
)
print(np.mean(vals), np.std(vals) / np.sqrt(np.mean(vals)))

100000.0 0.7367808357985433

def get_p_value_function_mc():
    print("Getting p-value function")
    print("- Defining sampling space")
    n_grid = 3000  # this drives the time
    max_d_mag = 20
    d_mag = np.linspace(-max_d_mag, max_d_mag, 2 * n_grid + 1)
    # d2d = np.logspace(-4, 1, n_grid+1)  # this should not affect the result, but it does
    d2d = np.linspace(0, 10, n_grid + 1)
    d_d_mag, d_d2d = np.diff(d_mag), np.diff(d2d)
    d_mag = (d_mag[1:] + d_mag[:-1]) / 2
    d2d = (d2d[1:] + d2d[:-1]) / 2
    d_mag, d2d = np.meshgrid(d_mag, d2d)
    d_d_mag, d_d2d = np.meshgrid(d_d_mag, d_d2d)
    d_area = d_d_mag * d_d2d * d2d  # d2d is the 2d radius, so the area is approx r dr)

    print("- Calculating match probability")
    p_match = xm.agasc_gaia_match_probability(d_mag, d2d)
    norm_p_match = p_match / np.sum(p_match * d_area)

    print("- Generating Random sample")
    # now generate a sample according to that distribution
    # up-sample the tails, so the probability never goes below the given value
    p_sample_min = 1e-3
    p_sample_min = p_sample_min * norm_p_match.max()

    sample_prob = np.where(norm_p_match > p_sample_min, norm_p_match, p_sample_min)
    sample_weights = norm_p_match / sample_prob
    sample_prob = sample_prob / np.sum(sample_prob)  # need to normalize for np.random

    # in general, the number of samples determines how much we explore the tails and how close we
    # get to zero, but thanks to up-sampling, the tails will be sampled, so we do not need millions
    # of samples
    idx = np.random.choice(
        np.arange(len(norm_p_match.flatten())), size=100000, p=(sample_prob).flatten()
    )
    # d_mag_sample = d_mag.flatten()[idx]
    # d2d_sample = d2d.flatten()[idx]
    p_match_sample = p_match.flatten()[idx]
    p_match_sample_weights = sample_weights.flatten()[idx]
    eps = np.finfo(p_match_sample.dtype).eps
    print("- Calculating CDF")
    bins = np.logspace(
        np.log(p_match_sample[p_match_sample > 0].min() * (1 - eps)),
        np.log(p_match_sample.max() * (1 + eps)),
        1000000,  # the number of bins directly affects the lowest p_value we will get
    )
    vals, bins = np.histogram(p_match_sample, bins=bins, weights=p_match_sample_weights)
    x = (bins[1:] + bins[:-1]) / 2
    n = np.cumsum(vals) / np.sum(vals)
    return scipy.interpolate.interp1d(
        x, n, fill_value=(np.min(n), np.max(n)), bounds_error=False
    )

get_p_value_sample = get_p_value_function_mc()

Getting p-value function
- Defining sampling space
- Calculating match probability
- Generating Random sample
- Calculating CDF

get_p_value_1p8 = xm.get_p_value_function()

# Note that this probability can be larger than one, because this a probability density function
p_max = xm.agasc_gaia_match_probability(0.,0.)
print(p_max, get_p_value_sample(p_max))

3.2217620507337608 0.9994103897131031

def plot_p_value_contours():
    d_mag = np.linspace(-8, 8, 1001)
    d_angle = np.linspace(0, 4, 1001)

    d_mag, d_angle = np.meshgrid(d_mag, d_angle)
    p_match = xm.agasc_gaia_match_probability(d_mag, d_angle)
    p_value = get_p_value_1p8(p_match)

    contour = plt.contour(
        d_mag,
        d_angle,
        np.log(p_value),
        levels=np.log(np.array([0.00001, 0.0001, 0.001, 0.01, 0.02, 0.1, 0.99])),
        cmap='winter',
        vmax=3
    )

    def fmt(x):
        x = 100*np.exp(x)
        if x < 0.01:
            return f"{x:.3f}%"
        if x < 0.1:
            return f"{x:.2f}%"
        if x < 1:
            return f"{x:.1f}%"
        return f"{x:.0f}%"

    plt.clabel(contour, inline=1, fontsize=8, fmt=fmt)
    plt.xlabel("$\Delta$Mag")
    plt.ylabel("radial offset (arcsec)")
    plt.title("p-value contours")

def plot_p_value_v_mag(delta_angle=0):
    delta_mag = np.linspace(0, 8, 1000)
    p_match = xm.agasc_gaia_match_probability(delta_mag, delta_angle)
    plt.plot(delta_mag, get_p_value_sample(p_match))
    plt.yscale("log")
    plt.xlabel('Magnitude difference')
    plt.ylabel('p-value')
    plt.title('p-value Vs magnitude difference')


def plot_p_value_v_angle(delta_mag=0):
    delta_angle = np.linspace(0, 4, 1000)
    p_match = xm.agasc_gaia_match_probability(delta_mag, delta_angle)
    plt.plot(delta_angle, get_p_value_sample(p_match))
    plt.yscale("log")
    plt.xlabel('Angular separation (arcsec)')
    plt.ylabel('p-value')
    plt.title('p-value Vs angular separation')

fix, axes = plt.subplot_mosaic(
    [
        ["function", "2d"],
        ["angle", "mag"]
    ],
    figsize=(12, 8)
)
plt.sca(axes["function"])
x = np.linspace(0, p_max, 100)
plt.plot(x, get_p_value_sample(x), label="this notebook")
plt.plot(x, get_p_value_1p8(x), '--', label="1.8")
plt.xlabel('Match probability')
plt.ylabel('p-value')
plt.title('p-value function')
plt.legend()

plt.sca(axes["angle"])
plot_p_value_v_angle()

plt.sca(axes["mag"])
plot_p_value_v_mag()

plt.sca(axes["2d"])
plot_p_value_contours()
plt.tight_layout()

p_match = generate_sample(
    xm.agasc_gaia_match_probability,
    (-8, 8),
    (-3, 3),
    n_sample=10000,
    n_grid=5000,
)


p_value = get_p_value_sample(p_match)

vals, bins, _ = plt.hist(
    p_value,
    histtype='step',
)
print(np.mean(vals), np.std(vals) / np.sqrt(np.mean(vals)))

1000.0 0.8749857141690944

# agasc_direct_2 = xm.get_agasc_gaia_x_match()
agasc_direct = xm.get_agasc_gaia_x_match_difficult_fixed()
agasc_direct = agasc_direct[agasc_direct["best_match"]]

agasc_summary = datasets.get_agasc_summary()

i = np.searchsorted(agasc_summary["agasc_id"], agasc_direct["agasc_id"])
cols = [
    "class"
]
for col in cols:
    agasc_direct[col] = agasc_summary[col][i]

agasc_direct = agasc_direct[np.in1d(agasc_direct["class"], [0, 2, 6])]

# agasc_direct = agasc_direct[(~agasc_direct['d2d'].mask) & (~agasc_direct['d_mag'].mask)]
agasc_direct['p_match'] = xm.agasc_gaia_match_probability(agasc_direct['d_mag'], agasc_direct['d2d'])
agasc_direct['p_value'] = get_p_value_1p8(agasc_direct['p_match'])
agasc_direct['p_value_notebook'] = get_p_value_sample(agasc_direct['p_match'])

agasc_direct = agasc_direct[['d2d', 'd_mag', 'p_match', 'p_value', 'p_value_notebook']]

agasc_indirect = xm.get_agasc_tycho_gsc_gaia_x_match()

i = np.searchsorted(agasc_summary["agasc_id"], agasc_indirect["agasc_id"])
cols = [
    "class"
]
for col in cols:
    agasc_indirect[col] = agasc_summary[col][i]

agasc_indirect = agasc_indirect[np.in1d(agasc_indirect["class"], [0, 2, 6])]

# agasc_indirect = agasc_indirect[(~agasc_indirect['d2d'].mask) & (~agasc_indirect['d_mag'].mask)]
agasc_indirect['p_match'] = xm.agasc_gaia_match_probability(agasc_indirect['d_mag'], agasc_indirect['d2d'])
agasc_indirect['p_value'] = get_p_value_1p8(agasc_indirect['p_match'])
agasc_indirect['p_value_notebook'] = get_p_value_sample(agasc_indirect['p_match'])

agasc_indirect = agasc_indirect[['d2d', 'd_mag', 'p_match', 'p_value', 'p_value_notebook']]

bins = np.linspace(0, 5, 51)
# dx = np.diff(bins)
dx = 2 * np.pi * (bins[1:]**2 - bins[:-1]**2)
x = bins[:-1] + dx/2

plt.hist(
    agasc_direct['d2d'],
    bins=bins,
    density=True,
    histtype='step',
    label='direct'
)

plt.hist(
    agasc_indirect['d2d'],
    bins=bins,
    density=True,
    histtype='step',
    label='indirect'
)
x2 = np.linspace(0, 4, 1001)
dx2 = np.diff(x2)
x2 = x2[:-1] + dx2 / 2
p_d2d = xm.d2d_probability(x2)
p_d2d /= np.sum(dx2 * p_d2d)
plt.plot(x2, p_d2d)
plt.legend()
plt.yscale('log')
# plt.xlim((0, 1))

bins = np.linspace(0, 5, 51)
# dx = np.diff(bins)
dx = 2 * np.pi * (bins[1:]**2 - bins[:-1]**2)
x = bins[:-1] + dx/2

p = [0.89020772, 0.09891197, 0.00791296, 0.00296736]

plt.hist(
    agasc_indirect['d2d'],
    bins=bins,
    density=True,
    histtype='step',
    label='AGASC-Tycho2-GSC2.3-Gaia'
)
x2 = np.linspace(0, 4, 1001)
dx2 = np.diff(x2)
x2 = x2[:-1] + dx2 / 2
p_d2d = xm.d2d_probability(x2)
p_d2d /= np.sum(dx2 * p_d2d)
plt.plot(x2, p_d2d, color='r')
y = p[0] * xm.gaussian_d2d_probability_(x2, sigma_d2d=0.25)
plt.plot(x2[y>1e-5], y[y>1e-5], ":", color="r")
y = p[1] * xm.gaussian_d2d_probability_(x2, sigma_d2d=0.5)
plt.plot(x2[y>1e-5], y[y>1e-5], ":", color="r")
y = p[2] * xm.gaussian_d2d_probability_(x2, sigma_d2d=0.8)
plt.plot(x2[y>1e-5], y[y>1e-5], ":", color="r")
y = p[3] * xm.gaussian_d2d_probability_(x2, sigma_d2d=1.2)
plt.plot(x2[y>1e-5], y[y>1e-5], ":", color="r")
plt.legend()
plt.yscale('log')
plt.title("AGASC-Gaia Angular Separation")
plt.xlabel("Angular Separation (arcsec)")
# plt.xlim((0, 1))

Text(0.5, 0, 'Angular Separation (arcsec)')

plt.hist(
    agasc_direct['p_value'],
    bins=np.linspace(0, 1, 101),
    density=False,
    histtype='step',
    label='direct (AGASC-Gaia)'
)
plt.hist(
    agasc_indirect['p_value'],
    bins=np.linspace(0, 1, 101),
    density=False,
    histtype='step',
    label='indirect (AGASC-Tycho2-GSC2.3-Gaia)'
)
plt.legend()
# plt.axvline(0.006, color='r')
plt.xlabel('p-value')
plt.title("p-value distribution")
# plt.yscale('log')

Text(0.5, 1.0, 'p-value distribution')

plt.hist(
    agasc_direct['p_value'],
    bins=np.linspace(0, 0.05, 51),
    density=False,
    histtype='step',
    label='direct (AGASC-Gaia)'
)
plt.hist(
    agasc_indirect['p_value'],
    bins=np.linspace(0, 0.05, 51),
    density=False,
    histtype='step',
    label='indirect (AGASC-Tycho2-GSC2.3-Gaia)'
)
plt.legend()
# plt.axvline(0.02, color='r')
plt.axvline(0.006, linestyle='--', color='r')
# plt.axvline(0.002, linestyle=':', color='r')
plt.xlabel('p-value')
plt.title("p-value distribution")
plt.yscale('log')

print(f"""
len(agasc_direct),
p_value < 0.02:  {np.count_nonzero(agasc_direct['p_value'] < 0.02)/len(agasc_direct)*100:.2}%
p_value < 0.006: {np.count_nonzero(agasc_direct['p_value'] < 0.006)/len(agasc_direct)*100:.2}%
p_value < 0.002: {np.count_nonzero(agasc_direct['p_value'] < 0.002)/len(agasc_direct)*100:.2}%
""")

len(agasc_direct),
p_value < 0.02:  2.0%
p_value < 0.006: 1.0%
p_value < 0.002: 0.59%

#
sns.histplot(
    # x=agasc_direct['d2d'][agasc_direct["p_value"] > 0.02],
    # y=agasc_direct['d_mag'][agasc_direct["p_value"] > 0.02],
    x=agasc_direct['d2d'],
    y=agasc_direct['d_mag'],
    bins=[
        np.linspace(0, 5, 1000),
        np.linspace(-6, 6, 1000)
    ]
)

d_mag = np.linspace(-6, 6, 101)
d_angle = np.linspace(0, 3.5, 101)

d_mag, d_angle = np.meshgrid(d_mag, d_angle)
p_match = xm.agasc_gaia_match_probability(d_mag, d_angle)
p_value = get_p_value_1p8(p_match)

contour = plt.contour(
    d_angle,
    d_mag,
    p_value,
    # levels=np.array([0.002, 0.006, 0.02]),
    levels=np.array([0.006]),
    cmap='winter',
    vmax=3
)

plt.clabel(contour, inline=1)
plt.ylabel("Magnitude difference")
plt.xlabel("Angular separation (arcsec)")

Text(0.5, 0, 'Angular separation (arcsec)')

d2d_probability = xm.d2d_probability(agasc_direct['d2d'])

plt.hist(
    d2d_probability,
    bins=np.linspace(0, 1, 101),
    density=False,
    histtype='step',
    label='direct'
)

plt.legend()
plt.axvline(0.02, color='r')
# plt.yscale('log')

<matplotlib.lines.Line2D at 0x136c4b850>

#
sns.histplot(
    # x=agasc_direct['d2d'][agasc_direct["p_value"] > 0.02],
    # y=agasc_direct['d_mag'][agasc_direct["p_value"] > 0.02],
    x=agasc_direct['d2d'],
    y=agasc_direct['d_mag'],
    bins=[
        np.linspace(0, 5, 1000),
        np.linspace(-6, 6, 1000)
    ]
)

d_mag = np.linspace(-6, 6, 101)
d_angle = np.linspace(0, 3.5, 101)

d_mag, d_angle = np.meshgrid(d_mag, d_angle)
p_match = xm.agasc_gaia_match_probability(0, d_angle)
p_value = get_p_value_1p8(p_match)

contour = plt.contour(
    d_angle,
    d_mag,
    p_match,
    levels=np.array([0.002, 0.006, 0.02]),
    cmap='winter',
    vmax=3
)

plt.clabel(contour, inline=1)

<a list of 3 text.Text objects>

# agasc_difficult = utils.Cache.load('data/agasc-gaia-x-match-difficult.h5')
agasc_difficult = xm.get_agasc_gaia_x_match_difficult()

plt.hist(
    agasc_direct['p_value'],
    bins=np.linspace(0, 1, 101),
    # density=True,
    histtype='step',
    label='all'
)
plt.hist(
    agasc_difficult['p_value'][agasc_difficult['best_match']],
    bins=np.linspace(0, 1, 101),
    # density=True,
    histtype='step',
    label='difficult'
)
plt.legend()
# plt.axvline(0.02, color='r')
plt.yscale('log')

# some stars have NaN p-value but are nonetheless best matches.
# This is because d_mag is NaN. Might be good to look into it.
agasc_difficult[np.isnan(agasc_difficult['p_value']) & agasc_difficult['best_match']][["agasc_id", "gaia_id", "group", "d2d", "d_mag"]]

# the contents of this section are implemented in the following function of the cross_match module
# This is a smooth interpolation of the d2d distribution in the direct AGASC-Gaia matches:
_, direct_d2d = xm.smooth_d2d_probability(agasc_direct['d2d'])

# and this is the same for the indirect matches
indirect_d2d = xm.d2d_probability_interpolated

# If one uses `density=True` in the call to `plt.hist`, one ends up plotting `r * p(dr, theta)`
# whereas `xm.d2d_probability` is `p(dr, theta)`. The biggest difference is around zero.
# everything looked ok above because of a convenient choice of binning.

bins = np.linspace(0, 5, 801)
# dx = 2 * np.pi * (bins[1:]**2 - bins[:-1]**2)  # dx in cylindrical coordinates integrated over angle
# x = bins[:-1] + dx/2

plt.hist(
    agasc_indirect['d2d'],
    bins=bins,
    density=True,
    histtype='step',
)
x2 = np.linspace(0, 4, 1001)
dx2 = np.diff(x2)
x2 = x2[:-1] + dx2 / 2
p_d2d = xm.d2d_probability(x2)
p_d2d /= np.sum(dx2 * p_d2d)
plt.plot(x2, p_d2d, label="Current implementation")

v = x2 * xm.d2d_probability_interpolated(x2)
v = v / np.sum(v * dx2)
plt.plot(x2, v, label="Smooth interpolation")

plt.xlabel("r (arcsec)")
plt.ylabel("r p(r)")

plt.legend()
plt.xlim((0, 1))
# plt.yscale('log')
# plt.ylim((0.01, 5))

bins = np.logspace(-5, 1, 501)
vals, _ = np.histogram(
    agasc_indirect['d2d'],
    bins=bins,
)
vals = vals / np.diff(bins) / len(agasc_indirect)

plt.step(
    bins,
    np.concatenate([[0], vals]),
    label='indirect'
)

kde = scipy.stats.gaussian_kde(agasc_indirect['d2d'])

b = np.logspace(-4, 0, 40)
plt.plot(b, kde(b), label='kde')

plt.xlabel("r (arcsec)")
plt.ylabel("r p(r)")

plt.xscale('log')

bins = np.logspace(-5, 1, 501)
vals, _ = np.histogram(
    agasc_indirect['d2d'],
    bins=bins,
)
vals = vals / np.diff(bins) / len(agasc_indirect)

plt.step(
    bins,
    np.concatenate([[0], vals]),
)

kde2 = scipy.stats.gaussian_kde(np.log(agasc_indirect['d2d']))
kde = scipy.stats.gaussian_kde(agasc_indirect['d2d'])
b = bins[:]

y2 = kde2(np.log(b)) / (b)
y = kde(b)

plt.plot(b, y, label='kde')
plt.plot(b, y2, label='kde (log scale)')

plt.xlabel("r (arcsec)")
plt.ylabel("r p(r)")

plt.xscale('log')
plt.legend()

<matplotlib.legend.Legend at 0x1419a3250>

def multi_sigmoid(x, grid, scales, widths):
    grid = np.asanyarray(grid)
    scales = np.asanyarray(scales)
    widths = np.asanyarray(widths)
    x, grid, scales, widths = np.broadcast_arrays(x[None,:], grid[:,None], scales[:,None], widths[:,None])
    return np.sum(
        scales / (1 + np.exp((x - grid) / widths)),
        axis=0
    )


def fit_multi_sigmoid(grid, x, y):
    grid = np.asarray(grid, dtype=np.float64)
    widths = np.ones(len(grid)) * np.diff(grid)[0] / 2
    scales = np.ones(len(grid), dtype=np.float64) * np.mean(y)

    def func(x, *params):
        scales = params[:]
        return multi_sigmoid(x, grid, scales, widths)

    bounds = [(0, np.max(y))] * len(grid)
    bounds = list(zip(*bounds))
    params, covariance = scipy.optimize.curve_fit(
        func,
        x,
        y,
        p0=np.concatenate([scales]),
        bounds=bounds,
    )
    scales = params[:]
    return grid, scales, widths

kde2 = scipy.stats.gaussian_kde(np.log(agasc_indirect['d2d']))

x = np.logspace(-4, 1, 251)
# there are two factors of x in the denominator
# - one from the jacobian of the transformation (d log(r)/dr = 1/r)
# - one from the volume element, because the density from the KDE is `p(r) r` but we want p(r)
y = kde2(np.log(x)) / x**2

log_y = np.log(y)
offset = np.min(log_y)
grid, scales, widths = fit_multi_sigmoid(np.log(np.logspace(-3, 1, 20)), np.log(x), log_y - offset)

y2 = np.exp(multi_sigmoid(np.log(x), grid, scales, widths) + offset)

kde = scipy.interpolate.interp1d(
    x,
    y,
    fill_value=(y[0], 0),
    bounds_error=False
)
prob = scipy.interpolate.interp1d(
    x,
    y2,
    fill_value=(y[0], 0),
    bounds_error=False
)

plt.plot(kde.x, kde.y)
# plt.plot(x, y2)
plt.plot(prob.x, prob.y)
plt.xlabel("r (arcsec)")
plt.ylabel("p(r)")
plt.xscale('log')
# plt.yscale("log")

bins = np.logspace(-5, 1, 501)
vals, _ = np.histogram(
    agasc_indirect['d2d'],
    bins=bins,
)

vals = vals / np.diff(bins) / len(agasc_indirect)

plt.step(
    bins,
    np.concatenate([[0], vals]),
    label='indirect'
)

plt.plot(bins, bins * prob(bins), label='fit')
plt.yscale('log')
plt.xscale('log')
plt.ylim((1e-7, 1e1))
plt.xlabel("r (arcsec)")
plt.ylabel("r p(r)")

Text(0, 0.5, 'r p(r)')

plt.plot(x, log_y - offset)

yy = multi_sigmoid(np.log(x), grid, scales, widths)
plt.plot(x, yy)

for i in range(len(grid)):
    yy = multi_sigmoid(np.log(x), grid[i:i+1], scales[i:i+1], widths[i:i+1])
    plt.plot(x, yy, label=f"{grid[i]:.2f}")

plt.xscale('log')
# plt.yscale("log")
plt.ylim((1e-6, 1e2))
plt.xlabel("r (arcsec)")
plt.ylabel("multi-sigmoid(r)")

Text(0, 0.5, 'multi-sigmoid(r)')

def sigmoid(x, width, grid):
    return 1 / (1 + np.exp((x - grid) / width))

i = 0
width = widths[i]
grid = grid[i]
scale = scales[i]
plt.plot(np.log(x), scale * sigmoid(np.log(x), width, grid))

plt.axvline(grid, linestyle="--", linewidth=0.5, color='b')
plt.axhline(0.5 * scale, linestyle="--", linewidth=0.5, color='b')

plt.axvline(grid + width, linestyle=":", linewidth=1, color='orange')
plt.axhline(0.2689414213699951 * scale, linestyle=":", linewidth=1, color='orange')
plt.axvline(grid - width, linestyle=":", linewidth=1, color='orange')
plt.axhline((1 - 0.2689414213699951) * scale, linestyle=":", linewidth=1, color='orange')
plt.axhline(scale, linewidth=0.5, color='k')
plt.axhline(0, linewidth=0.5, color='k')
plt.title("Sigmoid")

Text(0.5, 1.0, 'Sigmoid')

bins = np.linspace(0, 5, 801)
dx = 2 * np.pi * (bins[1:]**2 - bins[:-1]**2)
x = bins[:-1] + dx/2

vals, _, _ = plt.hist(
    agasc_direct['d2d'],
    bins=bins,
    density=True,
    histtype='step',
    label='direct',
    color="tab:blue",
    linewidth=1,
)

plt.hist(
    agasc_indirect['d2d'],
    bins=bins,
    density=True,
    histtype='step',
    label='indirect',
    color="tab:orange",
    linewidth=1,
)
x2 = np.linspace(0, 4, 1001)
x2 = x2[:-1] + np.diff(x2) / 2

v = x2 * indirect_d2d(x2)
v = v / np.sum(v * dx2)
plt.plot(
    x2, v,
    color="tab:orange",
    linewidth=1.5,
)

v = x2 * direct_d2d(x2)
v = v / np.sum(v * dx2)
plt.plot(
    x2, v,
    color="tab:blue",
    linewidth=1.5,
)

plt.xlabel("r (arcsec)")
plt.ylabel("r p(r)")
plt.legend()
plt.xlim((0, 1))
# plt.yscale('log')
# plt.ylim((0.01, 5))

(0.0, 1.0)

agasc_id	gaia_id	group	d2d	d_mag
int64	int64	int64	float16	float16
86509569	37272589483308032	516	1.329	nan
120062593	1179937643468394752	667	0.1276	nan
173814857	3375357902495202048	915	0.2021	nan
190317945	3942119175534853504	1009	0.259	nan
193987737	1237442032881520128	1018	1.021	nan
226755120	2849245093298109312	1185	4.734	nan
230838097	105599987245451136	1209	3.352	nan
258343177	739577562896410624	1333	0.2234	nan
263982249	1283763839283936384	1353	0.182	nan
301728809	318144965102357376	1507	0.1121	nan
...	...	...	...	...
807803705	6281173616368185472	3761	0.8857	nan
853674521	2897984553971988736	3970	0.00406	nan
881722433	6176244992638943744	4090	0.11334	nan
928386985	2884234508231103616	4322	0.363	nan
1008360913	5433176789571159168	4758	0.3652	nan
1041766425	6693685472591970176	5010	0.1804	nan
1053300329	4929508139428270464	5036	0.1406	nan
1171147665	5296497525623305216	5949	0.05103	nan
1196039553	6491098908986711168	6292	0.0652	nan
1196432633	6393816662780744448	6294	0.146	nan

The 1d Gaussian Case¶

The 2d Gaussian Case¶

The Actual Function Used¶

p-values in Data¶

The match probability¶

The p-value distribution¶

Looking at "Difficult" Stars¶

Better p-match¶