This notebook is the second notebook to choose a magnitude model

It was used to compare two fitting strategies:

  • PCA this is the default model, which uses a principal component decomposition, and then a linear term of the first principal component plus a polynomial of the second principal component. It is of the form $m_{ACA} = A C_0 + Pol(C_1)$

  • simple color model, which is of the form

    $m_{ACA} = m_g + Pol(m_{rp} - m_{bp})$.

The simple model is intended to guarantee the scaling:

$m_{ACA}(m_g + k, m_{rp} + k, m_{bp} + k) = m_{ACA}(m_g, m_{rp}, m_{bp}) + k$

The main difference with default the PCA model is that the first term has a slope different than 1 in the PCA case, and equal to one in the simple color case.

NOTE¶

This is another possible model that I doubt will improve anything.

  • scaling model, which is of the form

    $m_{ACA} = \vec{m} \cdot \vec{p} + Pol(\vec{m} \cdot \vec{q})$,

    with the mean-subtracted magnitude vector $\vec{m} = (m_g - \langle m_g \rangle, m_{rp} - \langle m_{rp} \rangle, m_{bp} - \langle m_{bp} \rangle)$, and $\vec{p}$ and $\vec{q}$ chosen so $(p_0 + p_1 + p_2) = 1$ and $(q_0 + q_1 + q_2) = 0$. This is done by choosing $\vec{p}$ along the first principal component (eigenvector $\vec{v}_0$), and $\vec{q}$ along the projection of the second principal component (eigenvector $\vec{v}_1$) on the plane perpendicular to $\vec{u}_1 := \frac{1}{\sqrt{3}}(1, 1, 1)$:

    $\vec{p} = \frac{\vec{v}_0}{\sum_i {v}_{0 i}}$

    $\vec{q} = \vec{v_1} - (\vec{v_1} \cdot \vec{u}_1)\vec{u}_1$

In [1]:
import importlib
from astropy.table import Table

import pandas as pd
import seaborn as sns

import matplotlib
matplotlib.rcParams['font.size'] = 22
%matplotlib inline

sns.set_style("whitegrid")

pd.options.display.float_format = '{:,.3f}'.format
from agasc_gaia.gaia_model import get_missing_value_filler
from agasc_gaia import gaia_model
from agasc_gaia import datasets as ds
# %matplotlib inline
In [2]:
color_bins = np.linspace(-1, 5, 21)
mag_bins = np.linspace(0, 20, 41)
In [3]:
impute_train, _ = ds.get_gaia_min_bias_train_test()
missing_value_filler = get_missing_value_filler()
In [4]:
def make_profile(
        array,
        x_columns, y_columns,
        bins,
        other_columns=[]
):
    array = array.copy()
    assert isinstance(array, pd.DataFrame)
    if not isinstance(bins, list) or isinstance(bins, tuple):
        bins = [bins]
    if not isinstance(y_columns, list) or isinstance(y_columns, tuple):
        y_columns = [y_columns]
    if not isinstance(x_columns, list) or isinstance(x_columns, tuple):
        x_columns = [x_columns]
    assert len(bins) == len(x_columns)
    for x_col, col_bins in zip(x_columns, bins):
        array[f'__{x_col}_bin'] = np.digitize(array[x_col], col_bins)
    g = array.groupby([f'__{col}_bin' for col in x_columns])
    res = pd.DataFrame()
    res["n"] = g[y_columns[0]].count()
    for x_col in x_columns:
        res[f'{x_col}_mean'] = g[x_col].mean()
        res[f'{x_col}_std'] = g[x_col].std()
        res[f'{x_col}_min'] = g[x_col].min()
        res[f'{x_col}_max'] = g[x_col].max()
    for col in other_columns:
        res[f'{col}_mean'] = g[col].mean()
        res[f'{col}_std'] = g[col].std()
        res[f'{col}_min'] = g[col].min()
        res[f'{col}_max'] = g[col].max()
    for y_col in y_columns:
        res[f'{y_col}_mean'] = g[y_col].mean()
        res[f'{y_col}_std'] = g[y_col].std()
    res.index.names = [f'{x_col}_bin' for x_col in x_columns]
    return res
In [5]:
train_data = Table.read('data/agasc-gsc-tycho-gaia-x-match-train.fits')

train_data['mag_bin'] = np.digitize(train_data['ave_gaia_mag'], mag_bins)

sel = (
    (np.abs(train_data['mag_aca'] - train_data['ave_gaia_mag']) < 5)
    & (train_data['neighbor_mag_weight'] > 0.1)
    & (train_data['n_mag_neighbors'] == 1)
    & ~train_data['phot_variable_flag']
    & (train_data['has_mag'] > 0)
)
train_data = train_data[sel]

train_data['color'] = train_data['bp_mag'] - train_data['rp_mag']
train_data['color_bin'] = np.digitize(train_data['color'], color_bins)
train_data['mag_bin'] = np.digitize(train_data['g_mag'], mag_bins)

train_data = train_data.to_pandas()

test_data = Table.read('data/agasc-gsc-tycho-gaia-x-match-test.fits')
test_data['mag_bin'] = np.digitize(test_data['ave_gaia_mag'], mag_bins)
sel = (
    (np.abs(test_data['mag_aca'] - test_data['ave_gaia_mag']) < 5)
    & (test_data['neighbor_mag_weight'] > 0.1)
    & (test_data['n_mag_neighbors'] == 1)
    & ~test_data['phot_variable_flag']
    & (test_data['has_mag'] > 0)
)
test_data = test_data[sel]

test_data['color'] = test_data['bp_mag'] - test_data['rp_mag']
test_data['color_bin'] = np.digitize(test_data['color'], color_bins)
test_data['mag_bin'] = np.digitize(test_data['g_mag'], mag_bins)

test_data = test_data.to_pandas()
In [6]:
# train_data = train_data[np.abs(train_data['color']) < 0.5]
In [7]:
# importlib.reload(gaia_model)
deg = 2
model_1 = gaia_model.GaiaModel(
    missing_value_filler=get_missing_value_filler(),
    degree=deg,
    use_weights=False,
    fit_uncertainty=True,
    model="PCA",
    with_instrument_bias=True,
    mean_mag_threshold=8.5,
    exclude_outliers=True,
)
model_1.fit(train_data, impute_train.to_pandas())

model_2 = gaia_model.GaiaModel(
    missing_value_filler=get_missing_value_filler(),
    degree=deg,
    use_weights=False,
    fit_uncertainty=True,
    model="simple_color",
    with_instrument_bias=True,
    mean_mag_threshold=8.5,
    exclude_outliers=True,
)
model_2.fit(train_data, impute_train.to_pandas())

train_data['mag_aca_pred_1'] = model_1.predict(train_data)
train_data['mag_aca_pred_unbiased_1'] = model_1.predict(train_data, with_instrument_bias=False)
train_data['mag_aca_bias_1'] = train_data['mag_aca_pred_1'] - train_data['mag_aca_pred_unbiased_1']
train_data['mag_aca_pred_err_1'] = model_1.uncertainty(train_data)
train_data['mag_aca_res_1'] = train_data['mag_aca_pred_1'] - train_data['mag_aca_obs']

train_data['mag_aca_pred_2']  = model_2.predict(train_data)
train_data['mag_aca_pred_err_2'] = model_2.uncertainty(train_data)
train_data['mag_aca_res_2'] = train_data['mag_aca_pred_2'] - train_data['mag_aca_obs']

test_data['mag_aca_pred_1'] = model_1.predict(test_data)
test_data['mag_aca_pred_unbiased_1'] = model_1.predict(test_data, with_instrument_bias=False)
test_data['mag_aca_bias_1'] = test_data['mag_aca_pred_1'] - test_data['mag_aca_pred_unbiased_1']
test_data['mag_aca_pred_err_1'] = model_1.uncertainty(test_data)
test_data['mag_aca_res_1'] = test_data['mag_aca_pred_1'] - test_data['mag_aca_obs']

test_data['mag_aca_pred_2']  = model_2.predict(test_data)
test_data['mag_aca_pred_err_2']  = model_2.uncertainty(test_data)
test_data['mag_aca_res_2'] = test_data['mag_aca_pred_2'] - test_data['mag_aca_obs']
# making histograms with missing data makes my life harder
filled_train_data = missing_value_filler.fill_missing_values(train_data)
filled_test_data = missing_value_filler.fill_missing_values(test_data)
filled_test_data = filled_test_data[~np.isnan(filled_test_data['g_mag'])]


res_v_mag = make_profile(
    filled_train_data,
    'g_mag',
    ['mag_aca_res_1', 'mag_aca_res_2', 'mag_aca_bias_1'],
    mag_bins,
    other_columns=['mag_aca_obs', 'mag_aca']
)
res_v_color = make_profile(
    filled_train_data,
    'color',
    ['mag_aca_res_1', 'mag_aca_res_2', 'mag_aca_bias_1'],
    color_bins,
    other_columns=['mag_aca_obs', 'mag_aca']
)
res_v_mag_color = make_profile(
    filled_train_data,
    y_columns=['mag_aca_res_1', 'mag_aca_res_2', 'mag_aca_bias_1'],
    x_columns=['g_mag', 'color'],
    bins=[mag_bins, color_bins],
    other_columns=['mag_aca_obs', 'mag_aca'],
)
In [8]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
plt.sca(axes[0])
plt.errorbar(
    res_v_mag['mag_aca_mean'],
    res_v_mag['mag_aca_res_1_mean'],
    yerr=res_v_mag['mag_aca_res_1_std'] / np.sqrt(res_v_mag['n']),
    fmt='o',
    label='Model 1',
)
plt.errorbar(
    res_v_mag['mag_aca_mean']+0.01,
    res_v_mag['mag_aca_res_2_mean'],
    yerr=res_v_mag['mag_aca_res_2_std'] / np.sqrt(res_v_mag['n']),
    fmt='o',
    label='Model 2',
)
plt.legend()
plt.ylim((-0.2, 0.2))

plt.sca(axes[1])
plt.errorbar(
    res_v_color['color_mean'],
    res_v_color['mag_aca_res_1_mean'],
    yerr=res_v_color['mag_aca_res_1_std'] / np.sqrt(res_v_color['n']),
    fmt='o',
    label='Model 1',
)
plt.errorbar(
    res_v_color['color_mean']+0.01,
    res_v_color['mag_aca_res_2_mean'],
    yerr=res_v_color['mag_aca_res_2_std'] / np.sqrt(res_v_color['n']),
    fmt='o',
    label='Model 2',
)
plt.legend()
plt.ylim((-0.2, 0.2))
Out[8]:
(-0.2, 0.2)
No description has been provided for this image
In [9]:
fig, axes = plt.subplot_mosaic(
    [['color', '.'], ['2d', 'mag']],
    # sharex=True,
    # sharey=True,
    layout="constrained",
    height_ratios=[1, 4],
    width_ratios=[3, 1],
    figsize=(8, 10)
)

axes['2d'].sharex(axes['color'])
axes['2d'].sharey(axes['mag'])

axes['color'].tick_params(labelbottom=False)
axes['mag'].tick_params(labelleft=False)

plt.sca(axes['2d'])
sns.histplot(
    x=filled_train_data['color'],
    y=filled_train_data['g_mag'],
)
plt.sca(axes['mag'])
sns.histplot(
    y=filled_train_data['g_mag'],
)
# plt.ylabel('')
plt.sca(axes['color'])
sns.histplot(
    x=filled_train_data['color'],
)
# plt.xlabel('')
Out[9]:
<AxesSubplot: label='color', xlabel='color', ylabel='Count'>
No description has been provided for this image
In [10]:
n = len(filled_train_data)
frac = np.count_nonzero(
    (np.abs(filled_train_data.mag_aca_res_1) < filled_train_data.mag_aca_pred_err_1)
) / n
print(f"1-sigma coverage: {100*frac:.1f}%")
1-sigma coverage: 77.4%

The following is a collection of residual histograms, one per magnitude bin.

We can see that the simple color model deviates at low magnitudes by about 0.05 mag.

The dashed lines denote the uncertainty. The black dashed lines are the uncertainty of the model below mag 8.5 (without instrument effects), and the red lines denote the uncertainty above 8.5 (the total uncertainty, with instrument effects).

The total uncertainty is a bit larger than it would be in the gaussian case, since we have fat tails. To estimate how big is the effect, I calculated the coverage (the fraction of stars with residual less than its estimated uncertainty). In the gaussian case, this would be about 68%. In this case it is 77% on average.

In [11]:
width = 15
aspect = 1.5
cols = 3
mag_bin_indices = np.unique(filled_train_data.mag_bin)
mag_bin_indices = [idx for idx in mag_bin_indices if np.count_nonzero(filled_train_data.mag_bin == idx) > 20]
# mag_bin_indices = [13, 14, 15, 16]
y_res_bins = np.linspace(-.2, .2, 80)
# y_res_bins = np.linspace(-10., 10., 4000)


rows = len(mag_bin_indices) // cols + (1 if len(mag_bin_indices) % cols else 0)
fig, axes = plt.subplots(
    rows, cols, figsize=(width, rows*(width/cols/aspect)), sharex=True, sharey=True)
axes = axes.flatten()

for i, mag_bin_idx in enumerate(mag_bin_indices):
    ones = np.ones(np.count_nonzero(filled_train_data.mag_bin == mag_bin_idx))
    plt.sca(axes[i])
    vals, _ = np.histogram(
        filled_train_data.mag_aca_res_1[filled_train_data.mag_bin == mag_bin_idx],
        bins=y_res_bins
    )
    weight = ones/np.max(vals) if np.max(vals) else ones
    n, _, _ = plt.hist(
        filled_train_data.mag_aca_res_1[filled_train_data.mag_bin == mag_bin_idx],
        bins=y_res_bins,
        histtype='step',
        # density=True,
        weights=weight,
        label='PCA',
        linewidth=2
    )
    vals, _ = np.histogram(
        filled_train_data.mag_aca_res_2[filled_train_data.mag_bin == mag_bin_idx],
        bins=y_res_bins
    )
    weight = ones/np.max(vals) if np.max(vals) else ones
    n, _, _ = plt.hist(
        filled_train_data.mag_aca_res_2[filled_train_data.mag_bin == mag_bin_idx],
        bins=y_res_bins,
        histtype='step',
        # density=True,
        weights=weight,
        label='Simple Color',
        linewidth=2
    )
    mag = np.mean(filled_train_data.ave_gaia_mag[filled_train_data.mag_bin == mag_bin_idx])
    plt.axvline(-np.sqrt(model_1._base_variance), color='k', linestyle='--')
    plt.axvline(np.sqrt(model_1._base_variance), color='k', linestyle='--')
    plt.axvline(-np.sqrt(model_2._base_variance) + 0.001, color='r', linestyle='--')
    plt.axvline(np.sqrt(model_2._base_variance) + 0.001, color='r', linestyle='--')

    plt.axvline(0, color='k', linestyle='-')
    uncertainty = np.mean(filled_train_data.mag_aca_pred_err_1[filled_train_data.mag_bin == mag_bin_idx])
    plt.axvline(-uncertainty, color='r', linestyle=':')
    plt.axvline(uncertainty, color='r', linestyle=':')
    n = np.count_nonzero(filled_train_data.mag_bin == mag_bin_idx)
    frac = np.count_nonzero(
        (filled_train_data.mag_bin == mag_bin_idx)
        & (np.abs(filled_train_data.mag_aca_res_1) < filled_train_data.mag_aca_pred_err_1)
    ) / n
    plt.text(0.05, 0.8, f'$\sigma$ = {uncertainty:.3f}\ncoverage = {100*frac:.0f}%', fontsize='xx-small')
    plt.title(f'ave_gaia_mag = {mag:.2f}', fontsize='x-small')

# for ax in axes[::cols]:
#     plt.yscale('log')
#     plt.ylim((1e-3, 1e0))

for ax in axes[-cols:]:
    ax.set_xlabel(r'$mag_{ACA \, pred} - mag_{ACA \, obs}$')
    plt.xlim((-0.2, 0.2))

plt.sca(axes[0])
plt.legend(fontsize='x-small', loc='upper left')
plt.suptitle(f'Residuals')
plt.tight_layout()
plt.savefig('residuals-quarterly.png')
No description has been provided for this image
In [12]:
# and in the following histogram we can see that the residuals in model 2 are larger
min_mag, max_mag = (6, 8)
# min_mag, max_mag = (9, 10)
# min_mag, max_mag = (10, 12)
min_mag, max_mag = (11, 13)
sel = (filled_train_data['mag_aca_obs'] > min_mag) & (filled_train_data['mag_aca_obs'] < max_mag)
sns.histplot(
    (np.abs(filled_train_data['mag_aca_res_2']) - np.abs(filled_train_data['mag_aca_res_1'][sel])),
    bins=100
)
plt.yscale('log')
No description has been provided for this image
In [13]:
fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')

ax.scatter(
    res_v_mag_color['g_mag_mean'][res_v_mag_color['n'] > 2],
    res_v_mag_color['color_mean'][res_v_mag_color['n'] > 2],
    res_v_mag_color['mag_aca_res_2_mean'][res_v_mag_color['n'] > 2],
    s=1,
    # alpha=0.1,
    label='residuals'
)
Out[13]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x12a16ebc0>
No description has been provided for this image
In [14]:
dat = res_v_mag[res_v_mag['n'] > 30]
plt.errorbar(
    dat['mag_aca_mean'],
    dat['mag_aca_bias_1_mean'],
    yerr=dat['mag_aca_res_1_std'],
    fmt='.', label='Model 1'
)

# plt.plot(x, broken.predict(x), label='Broken')
plt.plot(dat['mag_aca_mean'], model_1.broken_.predict(dat['mag_aca_mean']), label='Broken')
/Users/javierg/SAO/Notebooks/agasc/gaia-magnitudes-2023/agasc_gaia/gaia_model.py:962: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  X = X[:, None]
Out[14]:
[<matplotlib.lines.Line2D at 0x12a2538e0>]
No description has been provided for this image
In [15]:
dat = res_v_mag_color[res_v_mag_color['n'] > 30]
plt.errorbar(
    dat['mag_aca_mean'],
    dat['mag_aca_bias_1_mean'],
    yerr=dat['mag_aca_res_1_std'] / np.sqrt(dat['n']),
    fmt='.', label='Model 1'
)
x = np.linspace(5.5, 11, 100)
plt.plot(x, model_1.broken_.predict(x), label='Broken')
Out[15]:
[<matplotlib.lines.Line2D at 0x12a466890>]
No description has been provided for this image
In [16]:
dat = res_v_mag_color[res_v_mag_color['n'] > 30]
plt.errorbar(
    dat['g_mag_mean'],
    dat['mag_aca_bias_1_mean'],
    yerr=dat['mag_aca_res_1_std'] / np.sqrt(dat['n']),
    fmt='.', label='Model 1')
x = np.linspace(5.5, 11, 100)
plt.plot(x, model_1.broken_.predict(x), label='Broken')
Out[16]:
[<matplotlib.lines.Line2D at 0x12a518fa0>]
No description has been provided for this image
In [17]:
# here we can see some outliers with mag < 8.5
# stars where the model gives a very different magnitude than what was actually observed
# it turns out that the issue here is in the AGASC supplement
sel = filled_train_data['g_mag'] < 8.5
res = filled_train_data[sel]['mag_aca_res_1']
plt.hist(
    res,
    bins=np.linspace(-5., 5., 101),
)
# plt.xlim((-0.1, 0.1))
plt.yscale('log')

q1, q2, q3 = np.quantile(res, [.25, .50, .75])
iqr = q3 - q1
v1 = q3 - 20 * iqr
v2 = q3 + 20 * iqr
outlier = (res > v2) | (res < v1)

plt.axvline(v1, color='k', linestyle='--')
plt.axvline(v2, color='k', linestyle='--')
filled_train_data[sel & outlier][['agasc_id']].values
Out[17]:
array([[ 130433016],
       [ 271454056],
       [ 271459424],
       [ 338830088],
       [ 338845600],
       [ 709002288],
       [ 724304304],
       [ 724828568],
       [ 822356856],
       [ 824336304],
       [ 932063320],
       [ 932067896],
       [ 934972992],
       [ 934973000],
       [ 935083512],
       [ 935085256],
       [ 998126280],
       [ 998639688],
       [1042299928],
       [1042815040],
       [1042820776],
       [1096567272],
       [1133385128],
       [1148205168],
       [1148592952]])
No description has been provided for this image
In [18]:
# here we can see some outliers with mag > 8.5
# stars where the model gives a very different magnitude than what was actually observed
# I have not checked where the issue comes from
sel = filled_train_data['g_mag'] > 8.5
res = filled_train_data[sel]['mag_aca_res_1']
plt.hist(
    res,
    bins=np.linspace(-5., 5., 101),
)
# plt.xlim((-0.1, 0.1))
plt.yscale('log')

q1, q2, q3 = np.quantile(res, [.25, .50, .75])
iqr = q3 - q1
v1 = q3 - 20 * iqr
v2 = q3 + 20 * iqr
outlier = (res > v2) | (res < v1)

plt.axvline(v1, color='k', linestyle='--')
plt.axvline(v2, color='k', linestyle='--')
filled_train_data[sel & outlier][['agasc_id']].values
Out[18]:
array([[    394488],
       [  41035896],
       [  42864912],
       [ 110892656],
       [ 195167088],
       [ 259149568],
       [ 263326640],
       [ 295583712],
       [ 338825648],
       [ 497824592],
       [ 505545160],
       [ 612370792],
       [ 649070072],
       [ 653919168],
       [ 731121880],
       [ 880545592],
       [ 897878136],
       [ 905715624],
       [1042300104],
       [1042302768],
       [1089217664],
       [1091193912],
       [1096420712],
       [1148203552],
       [1148716768],
       [1178221128]])
No description has been provided for this image
In [ ]: