from astropy.table import Table

import pandas as pd
import seaborn as sns

import matplotlib
matplotlib.rcParams['font.size'] = 22

sns.set_style("whitegrid")

pd.options.display.float_format = '{:,.3f}'.format

# required for the iterative imputer in the next line
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import FunctionTransformer

from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

%matplotlib inline

data = Table.read('data/agasc-gsc-tycho-gaia-x-match-train.fits')

data['mag_bin'] = np.digitize(data['ave_gaia_mag'], np.linspace(0, 20, 41))

sel = (
    (np.abs(data['mag_aca'] - data['ave_gaia_mag']) < 5)
    & (data['neighbor_mag_weight'] > 0.1)
    & (data['n_mag_neighbors'] == 1)
    & ~data['phot_variable_flag']
    & (data['has_mag'] > 0)
)
data = data[sel]

data = data.to_pandas()

def plot_weights(train):
    fig=plt.figure(figsize=(16, 6))

    ax1 = plt.subplot(131)
    ax2 = plt.subplot(132, sharex=ax1, )
    ax3 = plt.subplot(133, sharex=ax1, sharey=ax2)
    axes = [ax1, ax2, ax3]

    plt.sca(axes[0])

    plt.hist(
        train['ave_gaia_mag'],
        bins=b,
        histtype='step'
    )
    plt.yscale('log')
    plt.title('Magnitude Histogram')
    plt.xlabel('Average Gaia Mag.')
    plt.xlim((5, 12.5))

    plt.sca(axes[1])
    plt.plot(
        train['ave_gaia_mag'],
        train['stat_weight'],
        '.'
    )
    plt.title('Statistical Weight')
    plt.xlabel('Average Gaia Mag.')
    plt.yscale('log')
    plt.xlim((5, 11.5))

    plt.sca(axes[2])
    plt.plot(
        train['ave_gaia_mag'],
        train['weight'],
        '.'
    )
    plt.title('Weight')
    plt.xlabel('Average Gaia Mag.')
    plt.yscale('log')
    plt.xlim((5, 12))

    plt.tight_layout()

    return fig, axes
    

def plot_res_scatter(train, test):
    fig, axes = plt.subplots(
        1, 2,
        figsize=(16, 4),
        sharey=True,
        sharex=True
    )
    plt.sca(axes[0])
    plt.scatter(train.mag_aca_obs, train['y_res'])
    plt.ylabel(r'mag_aca$_{pred}$ - mag_aca$_{true}$')
    plt.xlabel('mag_aca')
    plt.title('Train')
    plt.sca(axes[1])
    plt.scatter(test.mag_aca_obs, test['y_res'])
    plt.title('Test')
    plt.xlabel('mag_aca')
    return fig, axes

def plot_res_1d(train, test):
    # fig = plt.figure(figsize=(12, 8))
    fig = plt.figure(figsize=(9.75, 6.5))
    plt.hist(
        train['y_res'],
        density=True,
        bins=np.linspace(-2.5, 2.5, 120),
        histtype='step',
        label='Train',
        linewidth=2
    )
    plt.hist(
        test['y_res'],
        density=True,
        bins=np.linspace(-2.5, 2.5, 120),
        histtype='step',
        label='Test',
        linewidth=2
    )

    plt.ylabel('probability density')
    plt.xlabel('Predicted - observed magnitude')
    plt.title('Distribution of magnitude errors (test/train)')
    plt.yscale('log')

    plt.legend(loc='upper left')

    plt.tight_layout()
    return fig, axes

def plot_res_2d(train):
    fig = plt.figure(figsize=(16, 8))
    sns.histplot(
        train,
        x='ave_gaia_mag', y='y_res',
        ax=plt.gca(),
        bins=[
            np.linspace(5, 12, 81),
            np.linspace(-1., 1, 161)
        ]
    )
    plt.ylim((-0.2, 0.2))
    return fig

def plot_res_v_mag(residuals, t_residuals):
    fig, axes = plt.subplots(
        2, 1,
        figsize=(12, 10),
        sharex=True,
        gridspec_kw={'height_ratios': [3, 1]}
    )

    plt.suptitle(f'Residuals')

    plt.sca(axes[0])
    plt.errorbar(
        residuals['ave_gaia_mag'].values,
        residuals['y_res'].values,
        yerr=residuals['d_y_res'] / np.sqrt(residuals['n']),
        marker='o',
        linestyle='none',
        label='train',
    )
    plt.errorbar(
        t_residuals['ave_gaia_mag'].values,
        t_residuals['y_res'].values,
        yerr=t_residuals['d_y_res'] / np.sqrt(t_residuals['n']),
        marker='o',
        linestyle='none',
        label='test'
    )
    plt.ylabel('residuals')
    plt.ylim((-.3, 1.))
    plt.legend(loc='upper left', fontsize='x-small')

    plt.sca(axes[1])
    plt.scatter(
        residuals['ave_gaia_mag'].values,
        residuals['y_res'].values / residuals['d_y_res'],
        marker='o'
    )
    plt.scatter(
        t_residuals['ave_gaia_mag'].values,
        t_residuals['y_res'].values / t_residuals['d_y_res'],
        marker='o'
    )
    plt.ylabel('pull')
    plt.ylim((-2, 2))

    plt.xlabel('mean Gaia mag')

    plt.tight_layout()
    
    return fig, axes

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

def get_weights(train, plot=False):
    v, b = np.histogram(
        (train['g_mag'] + train['bp_mag'] + train['rp_mag']) / 3,
        bins=np.linspace(0, 20, 241),
    )
    m = v>=200
    v[~m] = v[m].min()
    w = 1/v
    i = np.digitize(train['ave_gaia_mag'], b) - 1
    train['stat_weight'] = 0.
    train['stat_weight'] = w[i]


    thr = 9.
    train.loc[i > np.digitize(thr, b), 'stat_weight'] = w[np.digitize(thr, b)]

    train['stat_weight'] /= np.max(train['stat_weight'])
    train['weight'] = (
        train['stat_weight'] 
        * train['mag_aca_err_obs'].min()/train['mag_aca_err_obs']
    )
    
    if plot:
        plot_weights(train)

    return train['weight']


class GaiaSimpleFeatures(TransformerMixin, BaseEstimator):
    def __init__(self, n=1):
        self.n = n
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        n_samples, n_features = X.shape
        XP = np.empty(shape=(n_samples, 1 + self.n), dtype=X.dtype)
        XP[:, 0] = np.mean(X, axis=1)
        # XP[:,1] = X[:,0] - X[:,1]
        XP[:,1] = X[:,2] - X[:,1]
        for i in range(self.n - 1):
            XP[:,2 + i] = (X[:,0] - X[:,1])**(2 + i)
        return XP
    def get_feature_names(self, input_features=None):
        names = ['mean_mag', 'color']
        for i in range(self.n - 1):
            names += [f'mean_mag_{i+2}']
        return names
    def get_feature_names_out(self, input_features=None):
        names = ['mean_mag', 'color']
        for i in range(self.n - 1):
            names += [f'mean_mag_{i+2}']
        return names


class Dummy(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        print('FITTING')
        print('X:', X.shape, np.any(np.isnan(X)), np.min(X), np.min(X))
        print('y:', y.shape, np.any(np.isnan(y)), np.min(y), np.min(y))
        return self
    def transform(self, X):
        return X
    def get_feature_names(self, input_features=None):
        return []
    def get_feature_names_out(self, input_features=None):
        return []

class LogX:
    def __init__(self):
        pass

    def __call__(self, X:pd.DataFrame, y=None) -> pd.DataFrame:
        res = np.empty(shape=(X.shape[0], 3))
        res[:,0] = np.log(X[:,0])
        res[:,1] = X[:,1]
        res[:,2] = np.log(X[:,0])**2
        return res

# BaseEstimator adds get_params, set_params, _validate_data and other goodies
# RegressorMixin adds a standard score() member function
class AdHoc(RegressorMixin, BaseEstimator):
    def __init__(self, low_threshold=1e10, method=1):
        self.low_threshold = low_threshold
        self.method = method

    def fit(self, X, y=None, **kwargs):
        self.linear_ = LinearRegression()

        if 'sample_weight' in kwargs:
            sigma = 1./kwargs['sample_weight']
            sigma /= np.mean(sigma)
        else:
            sigma = None
        X, y = self._validate_data(X, y)

        m = X[:,0] < self.low_threshold
        self.linear_.fit(X[m], y[m])

        y2 = (self.linear_.predict(X) - y)
        
        self.mean_ = np.array([np.mean(X), np.mean(y2)])
        X = X - self.mean_[0]
        y2 = y2 - self.mean_[1]
        
        self.std_ = np.array([np.std(X), np.std(y2)])
        X /= self.std_[0]
        y2 /= self.std_[1]

        if self.method == 1:
            self.params_, self.covariance_ = scipy.optimize.curve_fit(self._broken_1, X.T[0], y2, p0=[0., 0.5, 0., 0.], sigma=sigma)
        elif self.method == 2:
            self.params_, self.covariance_ = scipy.optimize.curve_fit(self._broken_2, X.T[0], y2, p0=[0.5, 0.], sigma=sigma)

        return self

    def predict(self, X):
        broken = self._broken_1 if self.method == 1 else self._broken_2
        check_is_fitted(self)
        X = self._validate_data(X)
        y = self.linear_.predict(X)
        if self.method == 0:
            y_res = 0
        else:
            y_res = self.mean_[1] + self.std_[1] * broken((X.T[0] - self.mean_[0]) / self.std_[0], *self.params_)
        return y - y_res

    @staticmethod
    def _broken_1(x, b, point, m1, m2):
        y = np.empty(len(x))
        y[x < point] = b + m1 * x[x < point]
        y[x >= point] = b + m1 * (point) + m2 * (x[x >= point] - point)
        return y

    @staticmethod
    def _broken_2(x, point, m2):
        y = np.empty(len(x))
        y[x < point] = 0
        y[x >= point] = m2 * (x[x >= point] - point)
        return y


class Broken(RegressorMixin, BaseEstimator):
    def __init__(self, threshold=0):
        self.threshold = threshold

    def fit(self, X, y=None):
        self.linear_1_ = LinearRegression()
        self.linear_2_ = LinearRegression()

        
        X, y = self._validate_data(X, y)

        x = X[:,0]
        print(np.min(x), np.max(x), self.threshold)
        m = x < self.threshold
        print(np.count_nonzero(m))
        self.linear_1_.fit(X[m], y[m])

        m = x >= self.threshold
        print(np.count_nonzero(m))
        self.linear_2_.fit(X[m], y[m])

        return self

    def predict(self, X):
        check_is_fitted(self)
        X = self._validate_data(X)
        
        y = np.empty(len(X))
        m = X[:,0] < self.threshold
        if np.count_nonzero(m):
            y[m] = self.linear_1_.predict(X[m])
        m = X[:,0] >= self.threshold
        if np.count_nonzero(m):
            y[m] = self.linear_2_.predict(X[m])
        return y



# from sklearn.utils.estimator_checks import check_estimator
# check_estimator(AdHoc())
# check_estimator(Broken())

def linear_pca_1():
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('pca', PCA(n_components=1)),
            ('linear', LinearRegression())
        ]
    )
    return pipeline

def linear_pca_2():
    return Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('pca', PCA(n_components=2)),
            ('linear', LinearRegression())
        ]
    )
    return pipeline

def linear_pca_3():
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('pca', PCA(n_components=3)),
            ('linear', LinearRegression())
        ]
    )
    return pipeline

def ad_hoc():
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('simple', GaiaSimpleFeatures()),
            ('ad_hoc', AdHoc(low_threshold=9)),
        ]
    )
    return pipeline

def simple():
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('simple', GaiaSimpleFeatures()),
            ('linear', LinearRegression())
        ]
    )
    return pipeline

def simple2():
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('simple', GaiaSimpleFeatures(n=2)),
            ('linear', LinearRegression())
        ]
    )
    return pipeline

def simple_log():
    from sklearn.compose import TransformedTargetRegressor
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('simple', GaiaSimpleFeatures()),
            # ('scaler', StandardScaler()),
            ('log', FunctionTransformer(LogX())),
            # ('dummy', Dummy()),
            ('linear', TransformedTargetRegressor(
                regressor=LinearRegression(),
                func=np.log,
                inverse_func=np.exp,
                check_inverse=False
            ))
        ]
    )
    return pipeline

def rf():
    from sklearn.ensemble import RandomForestRegressor
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('scale', StandardScaler()),
            ('rf', RandomForestRegressor(n_estimators=20, min_samples_split=10))
        ]
    )
    return pipeline

pipelines = {
    'ad_hoc': {
        'description': 'Ad-Hoc',
        'pipeline': ad_hoc,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'ad_hoc_w': {
        'description': 'Ad-Hoc',
        'pipeline': ad_hoc,
        'args': lambda d: {'ad_hoc__sample_weight': get_weights(d)},
        'select': lambda d: d,
    },
    'ad_hoc_2': {
        'description': 'Ad-Hoc',
        'pipeline': lambda : Pipeline(
            steps=[
                ('imputer', IterativeImputer(max_iter=50)),
                ('simple', GaiaSimpleFeatures()),
                ('ad_hoc', AdHoc(low_threshold=9, method=2)),
            ]
        ),
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'ad_hoc_3': {
        'description': 'Ad-Hoc',
        'pipeline': lambda : Pipeline(
            steps=[
                ('imputer', IterativeImputer(max_iter=50)),
                ('simple', GaiaSimpleFeatures()),
                ('ad_hoc', AdHoc(low_threshold=8, method=1)),
            ]
        ),
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'rf': {
        'description': 'Random Forest',
        'pipeline': rf,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'linear_pca_1': {
        'description': '1d PCA',
        'pipeline': linear_pca_1,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'linear_pca_2': {
        'description': '2d PCA',
        'pipeline': linear_pca_2,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'linear_pca_3': {
        'description': '3d PCA',
        'pipeline': linear_pca_3,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'simple': {
        'description': 'Simple, not weighted',
        'pipeline': simple,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'simple_9': {
        'description': 'Simple, 9 min mag, weighted',
        'pipeline': simple,
        'args': lambda d: {'linear__sample_weight': get_weights(d)},
        'select': lambda d: d[(d['ave_gaia_mag'] < 9.)].reset_index(),
    },
    'simple_10_3': {
        'description': 'Simple, 10.3 min mag, weighted',
        'pipeline': simple,
        'args': lambda d: {'linear__sample_weight': get_weights(d)},
        'select': lambda d: d[(d['ave_gaia_mag'] < 10.3)].reset_index(),
    },
    'simple_n2': {
        'description': 'Simple, pol-2, weighted',
        'pipeline': simple2,
        'args': lambda d: {'linear__sample_weight': get_weights(d)},
        'select': lambda d: d,
    },
    'simple_log': {
        'description': 'Simple, log',
        'pipeline': simple_log,
        'args': lambda d: {},
        'select': lambda d: d,
    },
}

def mse_low(y_true, y_pred):
    m = y_true < 7
    return np.mean((y_true[m] - y_pred[m])**2)

def mse_mid(y_true, y_pred):
    m = (y_true > 9) & (y_true < 10)
    return np.mean((y_true[m] - y_pred[m])**2)

def mse_hi(y_true, y_pred):
    m = y_true > 10.5
    return np.mean((y_true[m] - y_pred[m])**2)

def mse(y_true, y_pred):
    a = np.mean((y_true - y_pred)**2)
    return a

def mse_r(y_true, y_pred, bmin, bmax):
    m = (y_true > bmin) & (y_true < bmax)
    return np.mean((y_true[m] - y_pred[m])**2)

def var(y_true, y_pred):
    return np.var(y_true - y_pred)

def q90(y_true, y_pred):
    q5, q95 = np.quantile(y_true - y_pred, [0.05, 0.95])
    return q95 - q5

def q99(y_true, y_pred):
    q5, q95 = np.quantile(y_true - y_pred, [0.005, 0.995])
    return q95 - q5

def q90_r(y_true, y_pred, bmin, bmax):
    m = (y_true > bmin) & (y_true < bmax)
    q5, q95 = np.quantile(y_true[m] - y_pred[m], [0.05, 0.95])
    return q95 - q5

def q99_r(y_true, y_pred, bmin, bmax):
    m = (y_true > bmin) & (y_true < bmax)
    q5, q95 = np.quantile(y_true[m] - y_pred[m], [0.005, 0.995])
    return q95 - q5

def mean_mag_r(y_true, y_pred, bmin, bmax):
    m = (y_true > bmin) & (y_true < bmax)
    return np.mean(y_true[m])

pipeline = pipelines['ad_hoc']['pipeline']()

x_train = data[['g_mag', 'bp_mag', 'rp_mag']]
y_train = data[['mag_aca_obs']]['mag_aca_obs']

pipeline.fit(x_train, y_train)

Pipeline(steps=[('imputer', IterativeImputer(max_iter=50)),
                ('simple', GaiaSimpleFeatures()),
                ('ad_hoc', AdHoc(low_threshold=9))])

Pipeline(steps=[('imputer', IterativeImputer(max_iter=50)),
                ('simple', GaiaSimpleFeatures()),
                ('ad_hoc', AdHoc(low_threshold=9))])

IterativeImputer(max_iter=50)

GaiaSimpleFeatures()

AdHoc(low_threshold=9)

y_pred = pipeline.predict(x_train)
y_true = y_train.values

from sklearn.metrics import make_scorer

scoring = {
    'mse_low': make_scorer(mse_low, greater_is_better=False),
    'mse_mid': make_scorer(mse_mid, greater_is_better=False),
    'mse_hi': make_scorer(mse_hi, greater_is_better=False),
    'mse': make_scorer(mse, greater_is_better=False),
    'q90': make_scorer(q90),
    'q99': make_scorer(q99),
}
bins = np.linspace(5.5, 11.5, 13)
for i, (bmin, bmax) in enumerate(zip(bins[:-1], bins[1:])):
    scoring[f'mse_bin_{i}'] = make_scorer(lambda y_true, y_pred, b1=bmin, b2=bmax: mse_r(y_true, y_pred, b1, b2))
    scoring[f'q90_bin_{i}'] = make_scorer(lambda y_true, y_pred, b1=bmin, b2=bmax: q90_r(y_true, y_pred, b1, b2))
    scoring[f'q99_bin_{i}'] = make_scorer(lambda y_true, y_pred, b1=bmin, b2=bmax: q99_r(y_true, y_pred, b1, b2))
    scoring[f'mean_mag_bin_{i}'] = make_scorer(lambda y_true, y_pred, b1=bmin, b2=bmax: mean_mag_r(y_true, y_pred, b1, b2))

x_train = data[['g_mag', 'bp_mag', 'rp_mag']]
y_train = data[['mag_aca_obs']]['mag_aca_obs']


scores_list = []
names = ['simple', 'rf', 'ad_hoc', 'simple_10_3', 'simple_9', 'linear_pca_2', 'linear_pca_3', 'ad_hoc_2']
names = ['simple', 'rf', 'ad_hoc', 'linear_pca_2', 'linear_pca_3']
for name in names:
    print(name)
    n_cv = 10
    pipeline = pipelines[name]['pipeline']()
    cv_scores = cross_validate(pipeline, x_train, y_train, scoring=scoring, cv=n_cv, return_train_score=True)
    cv_scores['name'] = [name] * n_cv
    scores_list.append(cv_scores)

simple
rf
ad_hoc
linear_pca_2
linear_pca_3

scores = pd.DataFrame({key: np.concatenate([s[key] for s in scores_list]) for key in scores_list[0].keys()})

score_summary = pd.DataFrame()
s = scores.groupby('name').mean()

mag_cols = [col for col in scores.columns if re.match('test_mean_mag_bin_', col)]
score_summary['mean_mag'] = s.loc['simple'][mag_cols].values

mse_cols = [col for col in scores.columns if re.match('test_mse_bin_', col)]
q90_cols = [col for col in scores.columns if re.match('test_q90_bin_', col)]
q99_cols = [col for col in scores.columns if re.match('test_q99_bin_', col)]
cols = [('mean_mag', '')]
for name in s.index:
    cols += [(name, 'q90'), (name, 'q99'), (name, 'mse')]
    score_summary[name, 'q90'] = np.sqrt(s.loc[name][q90_cols].values)
    score_summary[name, 'q99'] = np.sqrt(s.loc[name][q99_cols].values)
    score_summary[name, 'mse'] = np.sqrt(s.loc[name][mse_cols].values)

score_summary.columns = pd.MultiIndex.from_tuples(cols)

score_summary

fig = plt.figure(figsize=(12, 8))
plt.plot(score_summary['mean_mag'], score_summary['ad_hoc', 'q99'], 'o-', label='ad_hoc')
plt.plot(score_summary['mean_mag'], score_summary['rf', 'q99'], label='rf')
plt.plot(score_summary['mean_mag'], score_summary['simple', 'q99'], label='simple')
plt.plot(score_summary['mean_mag'], score_summary['linear_pca_2', 'q99'], label='PCA2')
plt.plot(score_summary['mean_mag'], score_summary['linear_pca_3', 'q99'], label='PCA3')
plt.legend(loc='upper left')

<matplotlib.legend.Legend at 0x12f922080>

fig = plt.figure(figsize=(12, 8))
plt.plot(score_summary['mean_mag'], score_summary['ad_hoc', 'q90'], 'o-', label='ad_hoc')
plt.plot(score_summary['mean_mag'], score_summary['rf', 'q90'], label='rf')
plt.plot(score_summary['mean_mag'], score_summary['simple', 'q90'], label='simple')
plt.plot(score_summary['mean_mag'], score_summary['linear_pca_2', 'q90'], label='PCA2')
plt.plot(score_summary['mean_mag'], score_summary['linear_pca_3', 'q90'], label='PCA3')
plt.legend(loc='upper left')

<matplotlib.legend.Legend at 0x12e65d0c0>

fig = plt.figure(figsize=(12, 8))
plt.plot(score_summary['mean_mag'], score_summary['ad_hoc', 'mse'], 'o-', label='ad_hoc')
plt.plot(score_summary['mean_mag'], score_summary['rf', 'mse'], label='rf')
# plt.plot(score_summary['mean_mag'], score_summary['simple', 'mse'], label='simple')
# plt.plot(score_summary['mean_mag'], score_summary['linear_pca_2', 'mse'], label='PCA2')
plt.plot(score_summary['mean_mag'], score_summary['linear_pca_3', 'mse'], label='PCA3')
plt.legend(loc='upper left')

<matplotlib.legend.Legend at 0x13037d5a0>

fig = plt.figure(figsize=(12, 8))
plt.plot(score_summary['mean_mag'], score_summary['ad_hoc'], 'o-', label='ad_hoc')
plt.legend(loc='upper left')

<matplotlib.legend.Legend at 0x13049a9e0>

def do(name, train, test):
    info = pipelines[name]

    if info['select'] is not None:
        train = info['select'](train)
    pipeline = info['pipeline']()
    args = info['args'](train)

    x_train = train[['g_mag', 'bp_mag', 'rp_mag']]
    y_train = train[['mag_aca_obs']]

    x_test = test[['g_mag', 'bp_mag', 'rp_mag']]
    y_test = test[['mag_aca_obs']]


    pipeline.fit(x_train, y_train['mag_aca_obs'], **args)

    train['y_pred'] = pipeline.predict(x_train)
    test['y_pred'] = pipeline.predict(x_test)

    train['y_res'] = train['y_pred'] - y_train['mag_aca_obs']
    test['y_res'] = test['y_pred'] - y_test['mag_aca_obs']
    
    return pipeline, train, test

def binned_residuals(train):
    residuals = train.groupby('mag_bin')[['ave_gaia_mag', 'y_res']].mean()
    residuals['d_y_res'] = train.groupby('mag_bin')[['y_res']].std()
    residuals['n'] = train.groupby('mag_bin')['y_res'].count()
    residuals = residuals[residuals['n'] > 1]
    return residuals

i = 10
x_val = (data['x_val_sample'] >= i) & (data['x_val_sample'] <= i+1)
train = data[~x_val].reset_index()
test = data[x_val].reset_index()
pipeline = do('ad_hoc', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
fig, axes = plot_res_v_mag(residuals, t_residuals)

# axes[0].set_ylim((-0.1, 0.1))

i = 10
x_val = (data['x_val_sample'] >= i) & (data['x_val_sample'] <= i+1)
train = data[~x_val].reset_index()
test = data[x_val].reset_index()
pipeline = do('ad_hoc_2', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
fig, axes = plot_res_v_mag(residuals, t_residuals)

# axes[0].set_ylim((-0.1, 0.1))

train = data[(data['x_val_sample'] > 1)].reset_index()
test = data[(data['x_val_sample'] <= 1)].reset_index()
pipeline = do('ad_hoc_w', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
fig, axes = plot_res_v_mag(residuals, t_residuals)

axes[0].set_ylim((-0.1, 0.1))

(-0.1, 0.1)

train = data[(data['x_val_sample'] > 1)].reset_index()
test = data[(data['x_val_sample'] <= 1)].reset_index()
pipeline = do('simple', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
plot_res_v_mag(residuals, t_residuals)

(<Figure size 1200x1000 with 2 Axes>,
 array([<AxesSubplot: ylabel='residuals'>,
        <AxesSubplot: xlabel='mean Gaia mag', ylabel='pull'>], dtype=object))

train = data[(data['x_val_sample'] > 1)].reset_index()
test = data[(data['x_val_sample'] <= 1)].reset_index()
pipeline = do('rf', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(test)
plot_res_v_mag(residuals, t_residuals)

(<Figure size 1200x1000 with 2 Axes>,
 array([<AxesSubplot: ylabel='residuals'>,
        <AxesSubplot: xlabel='mean Gaia mag', ylabel='pull'>], dtype=object))

train = data[(data['x_val_sample'] > 1)].reset_index()
test = data[(data['x_val_sample'] <= 1)].reset_index()
pipeline, train, test = do('simple_10_3', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
plot_res_v_mag(residuals, t_residuals)

(<Figure size 1200x1000 with 2 Axes>,
 array([<AxesSubplot: ylabel='residuals'>,
        <AxesSubplot: xlabel='mean Gaia mag', ylabel='pull'>], dtype=object))

i = 6
x_val = (data['x_val_sample'] >= i) & (data['x_val_sample'] <= i+1)
train = data[~x_val].reset_index()
test = data[x_val].reset_index()
pipeline, train, test = do('simple_9', train, test)
x_train = train[['g_mag', 'bp_mag', 'rp_mag']]
y_train = train[['mag_aca_obs']]
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)

fig, axes = plot_res_v_mag(residuals, t_residuals)

def f(x):
    a = 1/2 + np.arctan((x - 9.4) / 0.45)/np.pi
    s = 0.07 + 0.1 * (x - 10)
    return 0.012 + a * s

# plt.ylim((-0.1, 0.2))
x = np.linspace(5, 12, 71)
plt.sca(axes[0])
plt.plot(x, f(x))

[<matplotlib.lines.Line2D at 0x1307aecb0>]

x_val = (data['x_val_sample'] >= 6) & (data['x_val_sample'] <= 7)
train = data[~x_val].reset_index()
test = data[x_val].reset_index()
pipeline, train, test = do('simple_n2', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
plot_res_v_mag(residuals, t_residuals)

(<Figure size 1200x1000 with 2 Axes>,
 array([<AxesSubplot: ylabel='residuals'>,
        <AxesSubplot: xlabel='mean Gaia mag', ylabel='pull'>], dtype=object))

train = data[(data['x_val_sample'] > 1)].reset_index()
test = data[(data['x_val_sample'] <= 1)].reset_index()
pipeline, train, test = do('simple_log', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
plot_res_v_mag(residuals, t_residuals)

(<Figure size 1200x1000 with 2 Axes>,
 array([<AxesSubplot: ylabel='residuals'>,
        <AxesSubplot: xlabel='mean Gaia mag', ylabel='pull'>], dtype=object))

	mean_mag	ad_hoc			linear_pca_2			linear_pca_3			rf			simple
		q90	q99	mse	q90	q99	mse	q90	q99	mse	q90	q99	mse	q90	q99	mse
0	5.833	0.261	0.302	0.029	0.257	0.297	0.057	0.269	0.305	0.058	0.343	0.449	0.054	0.259	0.299	0.057
1	6.293	0.246	0.347	0.028	0.244	0.342	0.052	0.252	0.354	0.052	0.280	0.536	0.050	0.245	0.345	0.052
2	6.778	0.242	0.349	0.031	0.242	0.347	0.044	0.249	0.355	0.045	0.269	0.523	0.046	0.244	0.348	0.044
3	7.272	0.249	0.440	0.047	0.249	0.434	0.049	0.255	0.446	0.050	0.261	0.457	0.046	0.250	0.437	0.049
4	7.774	0.253	0.344	0.022	0.252	0.342	0.023	0.257	0.353	0.024	0.266	0.461	0.033	0.253	0.345	0.023
5	8.269	0.267	0.380	0.036	0.265	0.375	0.035	0.270	0.386	0.036	0.282	0.466	0.042	0.267	0.378	0.035
6	8.768	0.294	0.426	0.036	0.292	0.424	0.036	0.295	0.424	0.036	0.305	0.459	0.041	0.292	0.424	0.036
7	9.250	0.345	0.488	0.055	0.345	0.490	0.057	0.346	0.485	0.056	0.356	0.494	0.055	0.345	0.489	0.056
8	9.725	0.425	0.582	0.077	0.435	0.595	0.079	0.434	0.591	0.079	0.439	0.594	0.081	0.435	0.594	0.079
9	10.191	0.539	0.701	0.106	0.552	0.714	0.114	0.551	0.712	0.114	0.549	0.724	0.108	0.551	0.713	0.114
10	10.680	0.796	1.198	0.297	0.822	1.225	0.303	0.822	1.225	0.303	0.819	1.185	0.298	0.822	1.225	0.303
11	11.217	1.066	1.209	0.588	1.095	1.244	0.604	1.094	1.244	0.604	1.177	1.296	0.631	1.095	1.244	0.604

Plotting Functions¶

Pipeline Definitions¶

Fitting One by One¶