To determine if the magnetic field experienced by satellites can be predicted from their altitude from Earth.
We can reasonably assume that the magnetic field will be less intense as the altitude increases. However, because the Earth's geomagnetic field is not perfectly spherical but instead in the shape of a dipole, with anomalies and distortions from the pressure of the interplanetary magnetic field, the relationship between these two attributes might not be easily modeled.
MOST-268_HD209458_2014-268_HD209458_2014
A .tar file containing .fits files compressed as .tar files.
This dataset is available online: http://www.cadc-ccda.hia-iha.nrc-cnrc.gc.ca/en/search/?collection=MOST&noexec=true#resultTableTab
The data was recorded by the MOST satellite: http://www.asc-csa.gc.ca/fra/satellites/most/default.asp
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import tarfile
import logging
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics
from astropy.io import fits
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs
from plotly.offline import init_notebook_mode
from plotly.offline import plot
from plotly.offline import iplot
init_notebook_mode(connected=True)
cf.set_config_file(offline=True, world_readable=True, theme="ggplot")
plt.style.use("ggplot")
%matplotlib inline
# from highest to lowest: WARNING, INFO, DEBUG, NOTSET
logging.basicConfig(level=logging.WARNING)
def evaluate_model(y_actual, y_predicted, regression_obj):
"""
Evaluates a regression model
parameter y_actual: real values
parameter y_predicted: predicted values
return: dataframe of the results
"""
r_cross = cross_val_score(regression_obj, x_data, y_data, cv=4)
results = {
"Mean Absolute Error": metrics.mean_absolute_error(y_actual, y_predicted),
"Mean Squared Error": metrics.mean_squared_error(y_actual, y_predicted),
"Median Absolute Error": metrics.median_absolute_error(y_actual, y_predicted),
"Explained Variance Score": metrics.explained_variance_score(y_actual, y_predicted),
"R Squared Score": metrics.r2_score(y_actual, y_predicted),
"Cross-Validation Score Mean": r_cross.mean(),
"Cross-Validation Score Std": r_cross.std(),
}
return pd.DataFrame(data=results, index=[0, ])
def distribution_plot(red_function, blue_function):
"""
Creates a distribution plot
parameter red_function: predictor values
parameter blue_function: target values
return: None
"""
sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(14, 8))
plt.title("Generalization Error")
ax1 = sns.distplot(
red_function,
hist=False,
color="r",
label="Actual Values"
)
sns.distplot(
blue_function,
hist=False,
color="b",
label="Predicted Values",
ax=ax1
)
sns.despine()
# locate the dataset directory relative to this notebook
notebook_path = os.path.abspath("predict_magnetic_field.ipynb")
tar_dir = os.path.join(os.path.dirname(os.path.dirname(notebook_path)), "datasets/")
logging.debug("Datasets directory: {}".format(tar_dir))
data = []
for root, dirs, files in os.walk(tar_dir):
for f in files:
# for every .tar file in the datasets directory
if os.path.splitext(f)[1] == ".tar":
logging.info("Processing file: {}".format(f))
# open .tar_file
with tarfile.open(
name=os.path.join(root, f),
mode="r"
) as tar_obj:
# for every file in the tar file
for member in tar_obj.getnames():
if os.path.splitext(member)[1] == ".fits":
# extract .tar file in memory
logging.debug("Extracting file: {}".format(member))
extracted = tar_obj.extractfile(member)
# open extracted .fits file
with fits.open(extracted) as hdul:
hdr = hdul[0].header
data_dct = {
"Exposure Date": hdr["DATE-OBS"],
"Magnetic Field": hdr["MAG_FLD"],
"Altitude": hdr["SAT_ALT"],
}
data.append(data_dct)
# create a dataframe from the list of data dictionaries
df = pd.DataFrame(data)
# set exposure date as index
df["Exposure Date"] = pd.to_datetime(df["Exposure Date"], format="%Y-%m-%dT%H:%M:%S.000")
df.set_index(keys="Exposure Date", inplace=True)
df.sort_index(axis=0, ascending=True, inplace=True)
# sanity check
df.head()
df.info()
df.describe()
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8))
df["Altitude"].plot(kind="line", color="b", style=".", ax=ax1, legend=False)
ax1.xaxis.set_label_text("")
ax1.yaxis.set_label_text("M")
ax1.set_title("Altitude Over Time")
df["Magnetic Field"].plot(kind="line", color="g", style=".", ax=ax2)
ax2.yaxis.set_label_text("nT")
ax2.set_title("Magnetic Field Over Time")
fig.subplots_adjust(hspace=0.5)
fig.suptitle("Altitude and Magnetic Field Values Over Time", fontsize=16)
plt.show()
names = df.columns
scaler = StandardScaler()
# fit the data on the scaler object
scaled_df = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_df, columns=names)
scaled_df.iplot(
xTitle="Exposure Date",
yTitle="Values",
title="Standardized Altitude and Magnetic Field Values Over Time",
colors=["blue", "green"],
)
The line plot gives us an eagle eye's view of the data. We can see at first glance that there seems to be a relationship between the altitude of the satellite and the magnetic field: when the altitude is low, the magnetic field is stronger.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 8))
alt_count, alt_bin_edges = np.histogram(df["Altitude"])
df["Altitude"].plot(kind="hist", color="b", xticks=alt_bin_edges, ax=ax1)
ax1.yaxis.set_label_text("M (Standardized)")
ax1.set_title("Altitude")
ax1.tick_params(axis="x", labelrotation=90)
mag_count, mag_bin_edges = np.histogram(df["Magnetic Field"])
df["Magnetic Field"].plot(kind="hist", color="g", xticks=mag_bin_edges, ax=ax2)
ax2.yaxis.set_label_text("nT (Standardized)")
ax2.set_title("Magnetic Field")
ax2.tick_params(axis="x", labelrotation=90)
fig.subplots_adjust(hspace=0.5)
fig.suptitle("Histograms of the Altitude and Magnetic Field Values", fontsize=16)
plt.show()
The almost mirror image displayed by these histograms seems to confirm the relationship between the two attributes.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 8))
df["Altitude"].plot(kind="box", notch=True, patch_artist=True, color="b", ax=ax1)
ax1.yaxis.set_label_text("M")
df["Magnetic Field"].plot(kind="box", notch=True, patch_artist=True, color="g", ax=ax2)
ax2.yaxis.set_label_text("nT")
fig.suptitle("Distribution of Values", fontsize=16)
plt.show()
These box plots show values that are mostly homogeneous, without any outliers.
sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(14, 8))
plt.title("Altitude vs Magnetic Field")
sns.regplot(
x="Altitude",
y="Magnetic Field",
data=scaled_df,
ax=ax,
marker="+",
)
sns.despine()
The fitted line represents the predicted value; its diagonal suggest that there is a certain relationship between the two features. We can see that the actual data points are close to the line at low altitudes, but then drift downwards then up in a larger pattern as the altitude increases. This might indicate that the relationship between the two features is non-linear.
"""
Correlation coefficient:
Close to +1: Large positive relationship
Close to -1: Large negative relationship
Close to 0: No relationship
P-value:
< 0.001: Strong certainty in the results
< 0.05: Moderate certainty in the results
< 0.1: Weak certainty in the results
> 0.1: No certainty in the results
"""
pearson_coef, p_value = stats.pearsonr(scaled_df["Altitude"], scaled_df["Magnetic Field"])
print("Coefficient: {}\nP-Value: {}".format(pearson_coef, p_value))
The value of the coefficient indicates that there is a moderate to strong relationship between the two features, and the p-value represents a very high certainty of this interpretation.
sns.set_style("ticks")
fig, ax = plt.subplots(figsize=(14, 8))
plt.title("Altitude over Magnetic Field")
sns.residplot(
x="Altitude",
y="Magnetic Field",
data=scaled_df,
ax=ax,
)
sns.despine()
Here the values of the error change with the altitude; the residuals are not randomly seperated. This suggests the linear assumption is incorrect; we can apply a polynomial transformation to make the model more flexible.
# split the predictor and target values into two dataframes
y_data = scaled_df["Magnetic Field"]
x_data = scaled_df.drop("Magnetic Field", axis=1)
# randomly split data into training and testing groups
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=9)
print("Number of training samples: {}".format(x_train.shape[0]))
print("Number of test samples: {}".format(x_test.shape[0]))
lr1 = LinearRegression()
lr1.fit(x_train[["Altitude"]], y_train)
# predicting the output
yhat_linear = cross_val_predict(lr1, x_test, y_test, cv=4)
distribution_plot(scaled_df["Magnetic Field"], yhat_linear)
As suspected, a simple linear regression model performs poorly.
df_linear = evaluate_model(x_test, yhat_linear, lr1)
df_linear["Type"] = "Linear"
df_linear
# calculating the best degree of polynomial features
order = range(1, 11)
lr2 = LinearRegression()
r_squ_test = []
best_degree_dct = {}
for n in order:
pr = PolynomialFeatures(degree=n)
x_train_pr = pr.fit_transform(x_train[["Altitude"]])
x_test_pr = pr.fit_transform(x_test[["Altitude"]])
lr2.fit(x_train_pr, y_train)
r_squ = lr2.score(x_test_pr, y_test)
r_squ_test.append(r_squ)
best_degree_dct[r_squ] = n
key = max(best_degree_dct.keys())
best_degree = round(best_degree_dct[key])
print("Best degree of polynomial features: {}".format(best_degree))
plt.figure(figsize=(14, 8))
plt.plot(order, r_squ_test)
plt.xlabel("Order")
plt.ylabel("R^2")
plt.title("R^2 Using Test Data")
plt.show()
pipeline_input = [
("polynomial", PolynomialFeatures(degree=best_degree)),
("model", LinearRegression()),
]
pipe = Pipeline(pipeline_input)
pipe.fit(x_train, y_train)
# predicting the output
yhat_polynomial = pipe.predict(x_test)
distribution_plot(scaled_df["Magnetic Field"], yhat_polynomial)
df_polynomial = evaluate_model(x_test, yhat_polynomial, pipe)
df_polynomial["Type"] = "Polynomial"
df_polynomial
df_results = pd.concat([df_linear, df_polynomial], axis=0, ignore_index=False)
df_results.set_index("Type")
fig, axs = plt.subplots(3, 3, figsize=(14, 8))
colors = ["orange", "green"]
labels = ("Linear", "Polynomial")
# errors
df_results["Mean Absolute Error"].plot(
kind="barh",
title="Mean Absolute Error",
color=colors,
ax=axs[0, 0]
).set_yticklabels(labels)
df_results["Mean Squared Error"].plot(
kind="barh",
title="Mean Squared Error",
color=colors,
ax=axs[0, 1]
).set_yticklabels(labels)
df_results["Median Absolute Error"].plot(
kind="barh",
title="Median Absolute Error",
color=colors,
ax=axs[0, 2]
).set_yticklabels(labels)
# scores
df_results["Explained Variance Score"].plot(
kind="barh",
title="Explained Variance Score",
color=colors,
ax=axs[1, 0]
).set_yticklabels(labels)
df_results["R Squared Score"].plot(
kind="barh",
title="R Squared Score",
color=colors,
ax=axs[1, 1]
).set_yticklabels(labels)
# cross-validation
df_results["Cross-Validation Score Mean"].plot(
kind="barh",
title="Cross-Validation Score Mean",
color=colors,
ax=axs[2, 0]
).set_yticklabels(labels)
df_results["Cross-Validation Score Std"].plot(
kind="barh",
title="Cross-Validation Score Std",
color=colors,
ax=axs[2, 1]
).set_yticklabels(labels)
axs[1, 2].set_axis_off()
axs[2, 2].set_axis_off()
fig.subplots_adjust(hspace=0.5, wspace=0.4)
fig.suptitle("Linear vs Polynomial Evaluation", fontsize=16)
plt.show()
The regression plot and Pearson correlation seem to indicate that there is indeed a relationship between the altitude of the satellite and the intensity of the magnetic field it experiences.
However, the revised distribution plot from the polynomial regression still displays some significant differences from the actual values, especially when the magnetic field is less intense.
A possible explanation for this is that the higher the satellite is from Earth, the less it is envelopped by its magnetosphere, and thus other factors, such as charged particles present in the solar wind, become more significant.