import importlib
from astropy.table import Table

import pandas as pd
import seaborn as sns

import matplotlib
matplotlib.rcParams['font.size'] = 22
%matplotlib inline

sns.set_style("whitegrid")

pd.options.display.float_format = '{:,.3f}'.format
from agasc_gaia.gaia_model import get_missing_value_filler
from agasc_gaia import gaia_model
from agasc_gaia import datasets as ds
# %matplotlib inline

color_bins = np.linspace(-1, 5, 21)
mag_bins = np.linspace(0, 20, 41)

impute_train, _ = ds.get_gaia_min_bias_train_test()
missing_value_filler = get_missing_value_filler()

def make_profile(
        array,
        x_columns, y_columns,
        bins,
        other_columns=[]
):
    array = array.copy()
    assert isinstance(array, pd.DataFrame)
    if not isinstance(bins, list) or isinstance(bins, tuple):
        bins = [bins]
    if not isinstance(y_columns, list) or isinstance(y_columns, tuple):
        y_columns = [y_columns]
    if not isinstance(x_columns, list) or isinstance(x_columns, tuple):
        x_columns = [x_columns]
    assert len(bins) == len(x_columns)
    for x_col, col_bins in zip(x_columns, bins):
        array[f'__{x_col}_bin'] = np.digitize(array[x_col], col_bins)
    g = array.groupby([f'__{col}_bin' for col in x_columns])
    res = pd.DataFrame()
    res["n"] = g[y_columns[0]].count()
    for x_col in x_columns:
        res[f'{x_col}_mean'] = g[x_col].mean()
        res[f'{x_col}_std'] = g[x_col].std()
        res[f'{x_col}_min'] = g[x_col].min()
        res[f'{x_col}_max'] = g[x_col].max()
    for col in other_columns:
        res[f'{col}_mean'] = g[col].mean()
        res[f'{col}_std'] = g[col].std()
        res[f'{col}_min'] = g[col].min()
        res[f'{col}_max'] = g[col].max()
    for y_col in y_columns:
        res[f'{y_col}_mean'] = g[y_col].mean()
        res[f'{y_col}_std'] = g[y_col].std()
    res.index.names = [f'{x_col}_bin' for x_col in x_columns]
    return res

train_data = Table.read('data/agasc-gsc-tycho-gaia-x-match-train.fits')

train_data['mag_bin'] = np.digitize(train_data['ave_gaia_mag'], mag_bins)

sel = (
    (np.abs(train_data['mag_aca'] - train_data['ave_gaia_mag']) < 5)
    & (train_data['neighbor_mag_weight'] > 0.1)
    & (train_data['n_mag_neighbors'] == 1)
    & ~train_data['phot_variable_flag']
    & (train_data['has_mag'] > 0)
)
train_data = train_data[sel]

train_data['color'] = train_data['bp_mag'] - train_data['rp_mag']
train_data['color_bin'] = np.digitize(train_data['color'], color_bins)
train_data['mag_bin'] = np.digitize(train_data['g_mag'], mag_bins)

train_data = train_data.to_pandas()

test_data = Table.read('data/agasc-gsc-tycho-gaia-x-match-test.fits')
test_data['mag_bin'] = np.digitize(test_data['ave_gaia_mag'], mag_bins)
sel = (
    (np.abs(test_data['mag_aca'] - test_data['ave_gaia_mag']) < 5)
    & (test_data['neighbor_mag_weight'] > 0.1)
    & (test_data['n_mag_neighbors'] == 1)
    & ~test_data['phot_variable_flag']
    & (test_data['has_mag'] > 0)
)
test_data = test_data[sel]

test_data['color'] = test_data['bp_mag'] - test_data['rp_mag']
test_data['color_bin'] = np.digitize(test_data['color'], color_bins)
test_data['mag_bin'] = np.digitize(test_data['g_mag'], mag_bins)

test_data = test_data.to_pandas()

# train_data = train_data[np.abs(train_data['color']) < 0.5]

# importlib.reload(gaia_model)
deg = 2
model_1 = gaia_model.GaiaModel(
    missing_value_filler=get_missing_value_filler(),
    degree=deg,
    use_weights=False,
    fit_uncertainty=True,
    model="PCA",
    with_instrument_bias=True,
    mean_mag_threshold=8.5,
    exclude_outliers=True,
)
model_1.fit(train_data, impute_train.to_pandas())

model_2 = gaia_model.GaiaModel(
    missing_value_filler=get_missing_value_filler(),
    degree=deg,
    use_weights=False,
    fit_uncertainty=True,
    model="simple_color",
    with_instrument_bias=True,
    mean_mag_threshold=8.5,
    exclude_outliers=True,
)
model_2.fit(train_data, impute_train.to_pandas())

train_data['mag_aca_pred_1'] = model_1.predict(train_data)
train_data['mag_aca_pred_unbiased_1'] = model_1.predict(train_data, with_instrument_bias=False)
train_data['mag_aca_bias_1'] = train_data['mag_aca_pred_1'] - train_data['mag_aca_pred_unbiased_1']
train_data['mag_aca_pred_err_1'] = model_1.uncertainty(train_data)
train_data['mag_aca_res_1'] = train_data['mag_aca_pred_1'] - train_data['mag_aca_obs']

train_data['mag_aca_pred_2']  = model_2.predict(train_data)
train_data['mag_aca_pred_err_2'] = model_2.uncertainty(train_data)
train_data['mag_aca_res_2'] = train_data['mag_aca_pred_2'] - train_data['mag_aca_obs']

test_data['mag_aca_pred_1'] = model_1.predict(test_data)
test_data['mag_aca_pred_unbiased_1'] = model_1.predict(test_data, with_instrument_bias=False)
test_data['mag_aca_bias_1'] = test_data['mag_aca_pred_1'] - test_data['mag_aca_pred_unbiased_1']
test_data['mag_aca_pred_err_1'] = model_1.uncertainty(test_data)
test_data['mag_aca_res_1'] = test_data['mag_aca_pred_1'] - test_data['mag_aca_obs']

test_data['mag_aca_pred_2']  = model_2.predict(test_data)
test_data['mag_aca_pred_err_2']  = model_2.uncertainty(test_data)
test_data['mag_aca_res_2'] = test_data['mag_aca_pred_2'] - test_data['mag_aca_obs']
# making histograms with missing data makes my life harder
filled_train_data = missing_value_filler.fill_missing_values(train_data)
filled_test_data = missing_value_filler.fill_missing_values(test_data)
filled_test_data = filled_test_data[~np.isnan(filled_test_data['g_mag'])]


res_v_mag = make_profile(
    filled_train_data,
    'g_mag',
    ['mag_aca_res_1', 'mag_aca_res_2', 'mag_aca_bias_1'],
    mag_bins,
    other_columns=['mag_aca_obs', 'mag_aca']
)
res_v_color = make_profile(
    filled_train_data,
    'color',
    ['mag_aca_res_1', 'mag_aca_res_2', 'mag_aca_bias_1'],
    color_bins,
    other_columns=['mag_aca_obs', 'mag_aca']
)
res_v_mag_color = make_profile(
    filled_train_data,
    y_columns=['mag_aca_res_1', 'mag_aca_res_2', 'mag_aca_bias_1'],
    x_columns=['g_mag', 'color'],
    bins=[mag_bins, color_bins],
    other_columns=['mag_aca_obs', 'mag_aca'],
)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
plt.sca(axes[0])
plt.errorbar(
    res_v_mag['mag_aca_mean'],
    res_v_mag['mag_aca_res_1_mean'],
    yerr=res_v_mag['mag_aca_res_1_std'] / np.sqrt(res_v_mag['n']),
    fmt='o',
    label='Model 1',
)
plt.errorbar(
    res_v_mag['mag_aca_mean']+0.01,
    res_v_mag['mag_aca_res_2_mean'],
    yerr=res_v_mag['mag_aca_res_2_std'] / np.sqrt(res_v_mag['n']),
    fmt='o',
    label='Model 2',
)
plt.legend()
plt.ylim((-0.2, 0.2))

plt.sca(axes[1])
plt.errorbar(
    res_v_color['color_mean'],
    res_v_color['mag_aca_res_1_mean'],
    yerr=res_v_color['mag_aca_res_1_std'] / np.sqrt(res_v_color['n']),
    fmt='o',
    label='Model 1',
)
plt.errorbar(
    res_v_color['color_mean']+0.01,
    res_v_color['mag_aca_res_2_mean'],
    yerr=res_v_color['mag_aca_res_2_std'] / np.sqrt(res_v_color['n']),
    fmt='o',
    label='Model 2',
)
plt.legend()
plt.ylim((-0.2, 0.2))

(-0.2, 0.2)

fig, axes = plt.subplot_mosaic(
    [['color', '.'], ['2d', 'mag']],
    # sharex=True,
    # sharey=True,
    layout="constrained",
    height_ratios=[1, 4],
    width_ratios=[3, 1],
    figsize=(8, 10)
)

axes['2d'].sharex(axes['color'])
axes['2d'].sharey(axes['mag'])

axes['color'].tick_params(labelbottom=False)
axes['mag'].tick_params(labelleft=False)

plt.sca(axes['2d'])
sns.histplot(
    x=filled_train_data['color'],
    y=filled_train_data['g_mag'],
)
plt.sca(axes['mag'])
sns.histplot(
    y=filled_train_data['g_mag'],
)
# plt.ylabel('')
plt.sca(axes['color'])
sns.histplot(
    x=filled_train_data['color'],
)
# plt.xlabel('')

<AxesSubplot: label='color', xlabel='color', ylabel='Count'>

n = len(filled_train_data)
frac = np.count_nonzero(
    (np.abs(filled_train_data.mag_aca_res_1) < filled_train_data.mag_aca_pred_err_1)
) / n
print(f"1-sigma coverage: {100*frac:.1f}%")

1-sigma coverage: 77.4%

width = 15
aspect = 1.5
cols = 3
mag_bin_indices = np.unique(filled_train_data.mag_bin)
mag_bin_indices = [idx for idx in mag_bin_indices if np.count_nonzero(filled_train_data.mag_bin == idx) > 20]
# mag_bin_indices = [13, 14, 15, 16]
y_res_bins = np.linspace(-.2, .2, 80)
# y_res_bins = np.linspace(-10., 10., 4000)


rows = len(mag_bin_indices) // cols + (1 if len(mag_bin_indices) % cols else 0)
fig, axes = plt.subplots(
    rows, cols, figsize=(width, rows*(width/cols/aspect)), sharex=True, sharey=True)
axes = axes.flatten()

for i, mag_bin_idx in enumerate(mag_bin_indices):
    ones = np.ones(np.count_nonzero(filled_train_data.mag_bin == mag_bin_idx))
    plt.sca(axes[i])
    vals, _ = np.histogram(
        filled_train_data.mag_aca_res_1[filled_train_data.mag_bin == mag_bin_idx],
        bins=y_res_bins
    )
    weight = ones/np.max(vals) if np.max(vals) else ones
    n, _, _ = plt.hist(
        filled_train_data.mag_aca_res_1[filled_train_data.mag_bin == mag_bin_idx],
        bins=y_res_bins,
        histtype='step',
        # density=True,
        weights=weight,
        label='PCA',
        linewidth=2
    )
    vals, _ = np.histogram(
        filled_train_data.mag_aca_res_2[filled_train_data.mag_bin == mag_bin_idx],
        bins=y_res_bins
    )
    weight = ones/np.max(vals) if np.max(vals) else ones
    n, _, _ = plt.hist(
        filled_train_data.mag_aca_res_2[filled_train_data.mag_bin == mag_bin_idx],
        bins=y_res_bins,
        histtype='step',
        # density=True,
        weights=weight,
        label='Simple Color',
        linewidth=2
    )
    mag = np.mean(filled_train_data.ave_gaia_mag[filled_train_data.mag_bin == mag_bin_idx])
    plt.axvline(-np.sqrt(model_1._base_variance), color='k', linestyle='--')
    plt.axvline(np.sqrt(model_1._base_variance), color='k', linestyle='--')
    plt.axvline(-np.sqrt(model_2._base_variance) + 0.001, color='r', linestyle='--')
    plt.axvline(np.sqrt(model_2._base_variance) + 0.001, color='r', linestyle='--')

    plt.axvline(0, color='k', linestyle='-')
    uncertainty = np.mean(filled_train_data.mag_aca_pred_err_1[filled_train_data.mag_bin == mag_bin_idx])
    plt.axvline(-uncertainty, color='r', linestyle=':')
    plt.axvline(uncertainty, color='r', linestyle=':')
    n = np.count_nonzero(filled_train_data.mag_bin == mag_bin_idx)
    frac = np.count_nonzero(
        (filled_train_data.mag_bin == mag_bin_idx)
        & (np.abs(filled_train_data.mag_aca_res_1) < filled_train_data.mag_aca_pred_err_1)
    ) / n
    plt.text(0.05, 0.8, f'$\sigma$ = {uncertainty:.3f}\ncoverage = {100*frac:.0f}%', fontsize='xx-small')
    plt.title(f'ave_gaia_mag = {mag:.2f}', fontsize='x-small')

# for ax in axes[::cols]:
#     plt.yscale('log')
#     plt.ylim((1e-3, 1e0))

for ax in axes[-cols:]:
    ax.set_xlabel(r'$mag_{ACA \, pred} - mag_{ACA \, obs}$')
    plt.xlim((-0.2, 0.2))

plt.sca(axes[0])
plt.legend(fontsize='x-small', loc='upper left')
plt.suptitle(f'Residuals')
plt.tight_layout()
plt.savefig('residuals-quarterly.png')

# and in the following histogram we can see that the residuals in model 2 are larger
min_mag, max_mag = (6, 8)
# min_mag, max_mag = (9, 10)
# min_mag, max_mag = (10, 12)
min_mag, max_mag = (11, 13)
sel = (filled_train_data['mag_aca_obs'] > min_mag) & (filled_train_data['mag_aca_obs'] < max_mag)
sns.histplot(
    (np.abs(filled_train_data['mag_aca_res_2']) - np.abs(filled_train_data['mag_aca_res_1'][sel])),
    bins=100
)
plt.yscale('log')

fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')

ax.scatter(
    res_v_mag_color['g_mag_mean'][res_v_mag_color['n'] > 2],
    res_v_mag_color['color_mean'][res_v_mag_color['n'] > 2],
    res_v_mag_color['mag_aca_res_2_mean'][res_v_mag_color['n'] > 2],
    s=1,
    # alpha=0.1,
    label='residuals'
)

<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x12a16ebc0>

dat = res_v_mag[res_v_mag['n'] > 30]
plt.errorbar(
    dat['mag_aca_mean'],
    dat['mag_aca_bias_1_mean'],
    yerr=dat['mag_aca_res_1_std'],
    fmt='.', label='Model 1'
)

# plt.plot(x, broken.predict(x), label='Broken')
plt.plot(dat['mag_aca_mean'], model_1.broken_.predict(dat['mag_aca_mean']), label='Broken')

/Users/javierg/SAO/Notebooks/agasc/gaia-magnitudes-2023/agasc_gaia/gaia_model.py:962: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  X = X[:, None]

[<matplotlib.lines.Line2D at 0x12a2538e0>]

dat = res_v_mag_color[res_v_mag_color['n'] > 30]
plt.errorbar(
    dat['mag_aca_mean'],
    dat['mag_aca_bias_1_mean'],
    yerr=dat['mag_aca_res_1_std'] / np.sqrt(dat['n']),
    fmt='.', label='Model 1'
)
x = np.linspace(5.5, 11, 100)
plt.plot(x, model_1.broken_.predict(x), label='Broken')

[<matplotlib.lines.Line2D at 0x12a466890>]

dat = res_v_mag_color[res_v_mag_color['n'] > 30]
plt.errorbar(
    dat['g_mag_mean'],
    dat['mag_aca_bias_1_mean'],
    yerr=dat['mag_aca_res_1_std'] / np.sqrt(dat['n']),
    fmt='.', label='Model 1')
x = np.linspace(5.5, 11, 100)
plt.plot(x, model_1.broken_.predict(x), label='Broken')

[<matplotlib.lines.Line2D at 0x12a518fa0>]

# here we can see some outliers with mag < 8.5
# stars where the model gives a very different magnitude than what was actually observed
# it turns out that the issue here is in the AGASC supplement
sel = filled_train_data['g_mag'] < 8.5
res = filled_train_data[sel]['mag_aca_res_1']
plt.hist(
    res,
    bins=np.linspace(-5., 5., 101),
)
# plt.xlim((-0.1, 0.1))
plt.yscale('log')

q1, q2, q3 = np.quantile(res, [.25, .50, .75])
iqr = q3 - q1
v1 = q3 - 20 * iqr
v2 = q3 + 20 * iqr
outlier = (res > v2) | (res < v1)

plt.axvline(v1, color='k', linestyle='--')
plt.axvline(v2, color='k', linestyle='--')
filled_train_data[sel & outlier][['agasc_id']].values

array([[ 130433016],
       [ 271454056],
       [ 271459424],
       [ 338830088],
       [ 338845600],
       [ 709002288],
       [ 724304304],
       [ 724828568],
       [ 822356856],
       [ 824336304],
       [ 932063320],
       [ 932067896],
       [ 934972992],
       [ 934973000],
       [ 935083512],
       [ 935085256],
       [ 998126280],
       [ 998639688],
       [1042299928],
       [1042815040],
       [1042820776],
       [1096567272],
       [1133385128],
       [1148205168],
       [1148592952]])

# here we can see some outliers with mag > 8.5
# stars where the model gives a very different magnitude than what was actually observed
# I have not checked where the issue comes from
sel = filled_train_data['g_mag'] > 8.5
res = filled_train_data[sel]['mag_aca_res_1']
plt.hist(
    res,
    bins=np.linspace(-5., 5., 101),
)
# plt.xlim((-0.1, 0.1))
plt.yscale('log')

q1, q2, q3 = np.quantile(res, [.25, .50, .75])
iqr = q3 - q1
v1 = q3 - 20 * iqr
v2 = q3 + 20 * iqr
outlier = (res > v2) | (res < v1)

plt.axvline(v1, color='k', linestyle='--')
plt.axvline(v2, color='k', linestyle='--')
filled_train_data[sel & outlier][['agasc_id']].values

array([[    394488],
       [  41035896],
       [  42864912],
       [ 110892656],
       [ 195167088],
       [ 259149568],
       [ 263326640],
       [ 295583712],
       [ 338825648],
       [ 497824592],
       [ 505545160],
       [ 612370792],
       [ 649070072],
       [ 653919168],
       [ 731121880],
       [ 880545592],
       [ 897878136],
       [ 905715624],
       [1042300104],
       [1042302768],
       [1089217664],
       [1091193912],
       [1096420712],
       [1148203552],
       [1148716768],
       [1178221128]])

NOTE¶