Predicting Magnetic Field Values for Satellites¶

Author: Simon Lachaîne

simonthechain@gmail.com

Objective¶

To determine if the magnetic field experienced by satellites can be predicted from their altitude from Earth.

Hypothesis¶

We can reasonably assume that the magnetic field will be less intense as the altitude increases. However, because the Earth's geomagnetic field is not perfectly spherical but instead in the shape of a dipole, with anomalies and distortions from the pressure of the interplanetary magnetic field, the relationship between these two attributes might not be easily modeled.

Dataset¶

MOST-268_HD209458_2014-268_HD209458_2014

A .tar file containing .fits files compressed as .tar files.

This dataset is available online: http://www.cadc-ccda.hia-iha.nrc-cnrc.gc.ca/en/search/?collection=MOST&noexec=true#resultTableTab

The data was recorded by the MOST satellite: http://www.asc-csa.gc.ca/fra/satellites/most/default.asp

Initialization¶

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import tarfile
import logging

import numpy as np
import pandas as pd

from scipy import stats

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics

from astropy.io import fits

import matplotlib.pyplot as plt
import seaborn as sns

import cufflinks as cf

import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs
from plotly.offline import init_notebook_mode
from plotly.offline import plot
from plotly.offline import iplot

init_notebook_mode(connected=True)
cf.set_config_file(offline=True, world_readable=True, theme="ggplot")

plt.style.use("ggplot")
%matplotlib inline

# from highest to lowest: WARNING, INFO, DEBUG, NOTSET
logging.basicConfig(level=logging.WARNING)

def evaluate_model(y_actual, y_predicted, regression_obj):
    """
    Evaluates a regression model
    parameter y_actual: real values
    parameter y_predicted: predicted values
    return: dataframe of the results
    """
    r_cross = cross_val_score(regression_obj, x_data, y_data, cv=4)
    
    results = {
        "Mean Absolute Error": metrics.mean_absolute_error(y_actual, y_predicted),
        "Mean Squared Error": metrics.mean_squared_error(y_actual, y_predicted),
        "Median Absolute Error": metrics.median_absolute_error(y_actual, y_predicted),
        "Explained Variance Score": metrics.explained_variance_score(y_actual, y_predicted),
        "R Squared Score": metrics.r2_score(y_actual, y_predicted),
        "Cross-Validation Score Mean": r_cross.mean(),
        "Cross-Validation Score Std": r_cross.std(),
    }
    
    return pd.DataFrame(data=results, index=[0, ])

def distribution_plot(red_function, blue_function):
    """
    Creates a distribution plot
    parameter red_function: predictor values
    parameter blue_function: target values
    return: None
    """
    sns.set_style("ticks")
    fig, ax = plt.subplots(figsize=(14, 8))

    plt.title("Generalization Error")

    ax1 = sns.distplot(
        red_function,
        hist=False,
        color="r",
        label="Actual Values"
    )
    sns.distplot(
        blue_function,
        hist=False,
        color="b",
        label="Predicted Values",
        ax=ax1
    )
    sns.despine()

Loading Data¶

# locate the dataset directory relative to this notebook
notebook_path = os.path.abspath("predict_magnetic_field.ipynb")
tar_dir = os.path.join(os.path.dirname(os.path.dirname(notebook_path)), "datasets/")
logging.debug("Datasets directory: {}".format(tar_dir))

data = []

for root, dirs, files in os.walk(tar_dir):
    for f in files:
        
        # for every .tar file in the datasets directory
        if os.path.splitext(f)[1] == ".tar":
            logging.info("Processing file: {}".format(f))
            
            # open .tar_file
            with tarfile.open(
                name=os.path.join(root, f),
                mode="r"
            ) as tar_obj:
                
                # for every file in the tar file
                for member in tar_obj.getnames():
                    if os.path.splitext(member)[1] == ".fits":
                        
                        # extract .tar file in memory
                        logging.debug("Extracting file: {}".format(member))
                        extracted = tar_obj.extractfile(member)
                        
                        # open extracted .fits file
                        with fits.open(extracted) as hdul:
                            hdr = hdul[0].header
                            
                            data_dct = {
                                "Exposure Date": hdr["DATE-OBS"],
                                "Magnetic Field": hdr["MAG_FLD"],
                                "Altitude": hdr["SAT_ALT"],
                            }
                            data.append(data_dct)

# create a dataframe from the list of data dictionaries
df = pd.DataFrame(data)

Data Preprocessing¶

# set exposure date as index
df["Exposure Date"] = pd.to_datetime(df["Exposure Date"], format="%Y-%m-%dT%H:%M:%S.000")
df.set_index(keys="Exposure Date", inplace=True)
df.sort_index(axis=0, ascending=True, inplace=True)

Exploratory Data Analysis¶

# sanity check
df.head()

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6708 entries, 2014-08-19 20:11:35 to 2014-09-10 01:30:28
Data columns (total 2 columns):
Altitude          6708 non-null float64
Magnetic Field    6708 non-null float64
dtypes: float64(2)
memory usage: 157.2 KB

df.describe()

Line Plot¶

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8))

df["Altitude"].plot(kind="line", color="b", style=".", ax=ax1, legend=False)
ax1.xaxis.set_label_text("")
ax1.yaxis.set_label_text("M")
ax1.set_title("Altitude Over Time")

df["Magnetic Field"].plot(kind="line", color="g", style=".", ax=ax2)
ax2.yaxis.set_label_text("nT")
ax2.set_title("Magnetic Field Over Time")

fig.subplots_adjust(hspace=0.5)
fig.suptitle("Altitude and Magnetic Field Values Over Time", fontsize=16)

plt.show()

Interactive Line Plot with Standardization¶

names = df.columns
scaler = StandardScaler()

# fit the data on the scaler object
scaled_df = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_df, columns=names)

scaled_df.iplot(
    xTitle="Exposure Date",
    yTitle="Values",
    title="Standardized Altitude and Magnetic Field Values Over Time",
    colors=["blue", "green"],
)

The line plot gives us an eagle eye's view of the data. We can see at first glance that there seems to be a relationship between the altitude of the satellite and the magnetic field: when the altitude is low, the magnetic field is stronger.

Histogram¶

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 8))

alt_count, alt_bin_edges = np.histogram(df["Altitude"])
df["Altitude"].plot(kind="hist", color="b", xticks=alt_bin_edges, ax=ax1)
ax1.yaxis.set_label_text("M (Standardized)")
ax1.set_title("Altitude")
ax1.tick_params(axis="x", labelrotation=90)

mag_count, mag_bin_edges = np.histogram(df["Magnetic Field"])
df["Magnetic Field"].plot(kind="hist", color="g", xticks=mag_bin_edges, ax=ax2)
ax2.yaxis.set_label_text("nT (Standardized)")
ax2.set_title("Magnetic Field")
ax2.tick_params(axis="x", labelrotation=90)

fig.subplots_adjust(hspace=0.5)
fig.suptitle("Histograms of the Altitude and Magnetic Field Values", fontsize=16)
plt.show()

The almost mirror image displayed by these histograms seems to confirm the relationship between the two attributes.

Box Plot¶

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 8))

df["Altitude"].plot(kind="box", notch=True, patch_artist=True, color="b", ax=ax1)
ax1.yaxis.set_label_text("M")

df["Magnetic Field"].plot(kind="box", notch=True, patch_artist=True, color="g", ax=ax2)
ax2.yaxis.set_label_text("nT")
fig.suptitle("Distribution of Values", fontsize=16)

plt.show()

These box plots show values that are mostly homogeneous, without any outliers.

Regression Plot¶

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(14, 8))

plt.title("Altitude vs Magnetic Field")
sns.regplot(
    x="Altitude",
    y="Magnetic Field",
    data=scaled_df,
    ax=ax,
    marker="+",
)
sns.despine()

The fitted line represents the predicted value; its diagonal suggest that there is a certain relationship between the two features. We can see that the actual data points are close to the line at low altitudes, but then drift downwards then up in a larger pattern as the altitude increases. This might indicate that the relationship between the two features is non-linear.

Pearson Correlation¶

"""
Correlation coefficient:
Close to +1: Large positive relationship
Close to -1: Large negative relationship
Close to 0: No relationship

P-value:
< 0.001: Strong certainty in the results
< 0.05: Moderate certainty in the results
< 0.1: Weak certainty in the results
> 0.1: No certainty in the results
"""
pearson_coef, p_value = stats.pearsonr(scaled_df["Altitude"], scaled_df["Magnetic Field"])
print("Coefficient: {}\nP-Value: {}".format(pearson_coef, p_value))

Coefficient: -0.5941582377514553
P-Value: 0.0

The value of the coefficient indicates that there is a moderate to strong relationship between the two features, and the p-value represents a very high certainty of this interpretation.

Residual Plot¶

sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(14, 8))

plt.title("Altitude over Magnetic Field")
sns.residplot(
    x="Altitude",
    y="Magnetic Field",
    data=scaled_df,
    ax=ax,
)
sns.despine()

Here the values of the error change with the altitude; the residuals are not randomly seperated. This suggests the linear assumption is incorrect; we can apply a polynomial transformation to make the model more flexible.

Preparing Training and Test Samples¶

# split the predictor and target values into two dataframes
y_data = scaled_df["Magnetic Field"]
x_data = scaled_df.drop("Magnetic Field", axis=1)

# randomly split data into training and testing groups
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=9)

print("Number of training samples: {}".format(x_train.shape[0]))
print("Number of test samples: {}".format(x_test.shape[0]))

Number of training samples: 4695
Number of test samples: 2013

Linear Regression¶

lr1 = LinearRegression()
lr1.fit(x_train[["Altitude"]], y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

Predictions¶

# predicting the output
yhat_linear = cross_val_predict(lr1, x_test, y_test, cv=4)

distribution_plot(scaled_df["Magnetic Field"], yhat_linear)

As suspected, a simple linear regression model performs poorly.

Model Evaluation¶

df_linear = evaluate_model(x_test, yhat_linear, lr1)
df_linear["Type"] = "Linear"
df_linear

Polynomial Regression¶

# calculating the best degree of polynomial features
order = range(1, 11)
lr2 = LinearRegression()
r_squ_test = []
best_degree_dct = {}

for n in order:
    pr = PolynomialFeatures(degree=n)
    
    x_train_pr = pr.fit_transform(x_train[["Altitude"]])
    x_test_pr = pr.fit_transform(x_test[["Altitude"]])
    
    lr2.fit(x_train_pr, y_train)
    
    r_squ = lr2.score(x_test_pr, y_test)
    r_squ_test.append(r_squ)
    best_degree_dct[r_squ] = n

key = max(best_degree_dct.keys())
best_degree = round(best_degree_dct[key])

print("Best degree of polynomial features: {}".format(best_degree))

Best degree of polynomial features: 6

plt.figure(figsize=(14, 8))
plt.plot(order, r_squ_test)
plt.xlabel("Order")
plt.ylabel("R^2")
plt.title("R^2 Using Test Data")

plt.show()

pipeline_input = [
    ("polynomial", PolynomialFeatures(degree=best_degree)),
    ("model", LinearRegression()),
]

pipe = Pipeline(pipeline_input)
pipe.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('polynomial', PolynomialFeatures(degree=6, include_bias=True, interaction_only=False)), ('model', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False))])

Predictions¶

# predicting the output
yhat_polynomial = pipe.predict(x_test)

distribution_plot(scaled_df["Magnetic Field"], yhat_polynomial)

Model Evaluation¶

df_polynomial = evaluate_model(x_test, yhat_polynomial, pipe)
df_polynomial["Type"] = "Polynomial"
df_polynomial

Model Comparison¶

df_results = pd.concat([df_linear, df_polynomial], axis=0, ignore_index=False)
df_results.set_index("Type")

fig, axs = plt.subplots(3, 3, figsize=(14, 8))

colors = ["orange", "green"]
labels = ("Linear", "Polynomial")

# errors
df_results["Mean Absolute Error"].plot(
    kind="barh", 
    title="Mean Absolute Error", 
    color=colors,
    ax=axs[0, 0]
).set_yticklabels(labels)
df_results["Mean Squared Error"].plot(
    kind="barh",
    title="Mean Squared Error", 
    color=colors,
    ax=axs[0, 1]
).set_yticklabels(labels)
df_results["Median Absolute Error"].plot(
    kind="barh", 
    title="Median Absolute Error", 
    color=colors,
    ax=axs[0, 2]
).set_yticklabels(labels)

# scores
df_results["Explained Variance Score"].plot(
    kind="barh", 
    title="Explained Variance Score", 
    color=colors,
    ax=axs[1, 0]
).set_yticklabels(labels)
df_results["R Squared Score"].plot(
    kind="barh", 
    title="R Squared Score", 
    color=colors,
    ax=axs[1, 1]
).set_yticklabels(labels)

# cross-validation
df_results["Cross-Validation Score Mean"].plot(
    kind="barh", 
    title="Cross-Validation Score Mean", 
    color=colors,
    ax=axs[2, 0]
).set_yticklabels(labels)
df_results["Cross-Validation Score Std"].plot(
    kind="barh", 
    title="Cross-Validation Score Std", 
    color=colors,
    ax=axs[2, 1]
).set_yticklabels(labels)

axs[1, 2].set_axis_off()
axs[2, 2].set_axis_off()

fig.subplots_adjust(hspace=0.5, wspace=0.4)
fig.suptitle("Linear vs Polynomial Evaluation", fontsize=16)

plt.show()

Conclusion¶

The regression plot and Pearson correlation seem to indicate that there is indeed a relationship between the altitude of the satellite and the intensity of the magnetic field it experiences.

However, the revised distribution plot from the polynomial regression still displays some significant differences from the actual values, especially when the magnetic field is less intense.

A possible explanation for this is that the higher the satellite is from Earth, the less it is envelopped by its magnetosphere, and thus other factors, such as charged particles present in the solar wind, become more significant.

	Altitude	Magnetic Field
Exposure Date
2014-08-19 20:11:35	838931.497	22098.867
2014-08-19 20:12:37	838699.525	21012.742
2014-08-19 20:13:39	838418.956	20113.562
2014-08-19 20:14:41	838088.724	19390.773
2014-08-19 21:19:35	825348.484	27947.868

	Altitude	Magnetic Field
count	6708.000000	6708.000000
mean	819475.459218	33183.963045
std	9643.641820	7281.736133
min	808725.743000	17019.006000
25%	810733.799250	26888.738000
50%	816228.227000	34994.234500
75%	827720.180250	39993.574750
max	839441.271000	44679.224000

	Mean Absolute Error	Mean Squared Error	Median Absolute Error	Explained Variance Score	R Squared Score	Cross-Validation Score Mean	Cross-Validation Score Std
Type
Linear	1.395162	2.560186	1.437215	-1.601353	-1.601356	0.095636	0.637884
Polynomial	1.597531	2.890094	1.869005	-1.936253	-1.936570	0.715032	0.116730