十二、多元线性回归模型
大约 2 分钟
让我们以不同股票市场的历史数据为基础,模拟构建SPY交易模型的过程
import pandas as pd
import statsmodels.formula.api as smf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
# Load the price history of every stock-market index into its own DataFrame.
# The names on the left are matched, in order, with the CSV paths below.
aord, nikkei, hsi, daxi, cac40, sp500, dji, nasdaq, spy = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/indice/ALLOrdinary.csv',
        'data/indice/Nikkei225.csv',
        'data/indice/HSI.csv',
        'data/indice/DAXI.csv',
        'data/indice/CAC40.csv',
        'data/indice/SP500.csv',
        'data/indice/DJI.csv',
        'data/indice/nasdaq_composite.csv',
        'data/indice/SPY.csv',
    )
)
# Preview the first rows of one of the loaded tables.
nasdaq.head()

Step 1: Data Munging
# Because the markets trade in different time zones, each column uses a
# different price move (see the per-group comments below).
# indicepanel is the DataFrame our trading model is built on.
# NOTE(review): columns are aligned on spy.index; the CSVs are read without an
# explicit index column, so this is presumably a positional match rather than
# a match by date - confirm that all files cover the same trading days.
indicepanel = pd.DataFrame(
    {
        # Response: next row's SPY open minus the current row's open.
        'spy': spy['Open'].shift(-1) - spy['Open'],
        # Current open minus the previous row's open.
        'sp500': sp500["Open"] - sp500['Open'].shift(1),
        'nasdaq': nasdaq['Open'] - nasdaq['Open'].shift(1),
        'dji': dji['Open'] - dji['Open'].shift(1),
        'cac40': cac40['Open'] - cac40['Open'].shift(1),
        'daxi': daxi['Open'] - daxi['Open'].shift(1),
        # Close minus open within the same row.
        'aord': aord['Close'] - aord['Open'],
        'hsi': hsi['Close'] - hsi['Open'],
        'nikkei': nikkei['Close'] - nikkei['Open'],
        # SPY opening price itself.
        'Price': spy['Open'],
    },
    index=spy.index,
)
# One-row lag of the response, placed directly after 'spy' to keep the
# column order spy, spy_lag1, sp500, ...
indicepanel.insert(1, 'spy_lag1', indicepanel['spy'].shift(1))
indicepanel.head()

# Count the missing (NaN) values in each column of indicepanel.
indicepanel.isna().sum()
spy 1
spy_lag1 1
sp500 1
nasdaq 1
dji 1
cac40 3
daxi 11
aord 2
hsi 57
nikkei 57
Price 0
dtype: int64
# Forward-fill the NaN values with the last available value in each column,
# then drop the rows that still contain NaN (leading rows with nothing to
# fill from).
# DataFrame.ffill() is used instead of fillna(method='ffill'): the `method`
# argument of fillna is deprecated in recent pandas releases.
indicepanel = indicepanel.ffill()
indicepanel = indicepanel.dropna()
# Check that no NaN values remain in indicepanel.
indicepanel.isnull().sum()
spy 0
spy_lag1 0
sp500 0
nasdaq 0
dji 0
cac40 0
daxi 0
aord 0
hsi 0
nikkei 0
Price 0
dtype: int64
# Persist the cleaned indicepanel to CSV (it is reused in part 4.5),
# then print its (rows, columns) shape.
path_save = 'data/indice/indicepanel.csv'
indicepanel.to_csv(path_or_buf=path_save)
print(indicepanel.shape)
(2678, 11)
Step 2: Data Splitting
# Split the data by row position: the last 1000 rows form the test set and
# the 1000 rows before them form the train set.
# .copy() makes each split an independent DataFrame, so the columns added to
# Train/Test later on (predictions) are not written into a slice of
# indicepanel, which would trigger pandas' chained-assignment warning.
Train = indicepanel.iloc[-2000:-1000, :].copy()
Test = indicepanel.iloc[-1000:, :].copy()
print(Train.shape, Test.shape)
(1000, 11) (1000, 11)
Step 3: Explore the train data set
# Draw pairwise scatter plots of all Train columns (every stock market plus
# the SPY price) to observe the associations between them.
from pandas.plotting import scatter_matrix
sm = scatter_matrix(frame=Train, figsize=(10, 10))

Step 4: Check the correlation between each index and spy
# Correlation of every column except the last one ('Price') with the
# response 'spy'; used to spot the indices most correlated with it.
corr_array = Train[Train.columns[:-1]].corr().loc[:, 'spy']
print(corr_array)
spy 1.000000
spy_lag1 -0.011623
sp500 -0.018632
nasdaq 0.012333
dji -0.037097
cac40 0.076886
daxi 0.019410
aord 0.048200
hsi -0.038361
nikkei 0.035379
Name: spy, dtype: float64
# Ordinary least squares fitted on the training split only: regress 'spy'
# on its own lag and on the moves of the eight other indices.
formula = 'spy~spy_lag1+sp500+nasdaq+dji+cac40+aord+daxi+nikkei+hsi'
lm = smf.ols(formula, Train).fit()
lm.summary()

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Step 5: Make prediction
# Store the fitted model's predictions for both splits in a 'PredictedY' column.
Train['PredictedY'], Test['PredictedY'] = lm.predict(Train), lm.predict(Test)
# Scatter plot of the actual response against the prediction on the train set.
plt.scatter(x=Train['spy'], y=Train['PredictedY'])
<matplotlib.collections.PathCollection at 0x210353c2a50>
Step 6: Model evaluation - Statistical standard
We can measure the performance of our model using some statistical metrics - RMSE, Adjusted $R^2$
# RMSE - Root Mean Squared Error, Adjusted R^2
def adjustedMetric(data, model, model_k, yname):
    """Return the adjusted R^2 and the RMSE of ``model`` evaluated on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Holds the response column ``yname`` plus whatever ``model.predict``
        needs. A 'yhat' column with the predictions is written into it.
    model : object
        Anything exposing ``predict(data)`` that returns one prediction
        per row.
    model_k : int
        Number of predictors in the model; used for the
        degrees-of-freedom correction ``n - model_k - 1``.
    yname : str
        Name of the response column in ``data``.

    Returns
    -------
    tuple
        ``(adjustR2, RMSE)``.
    """
    data['yhat'] = model.predict(data)
    n_obs = data.shape[0]
    dof = n_obs - model_k - 1
    SST = ((data[yname] - data[yname].mean()) ** 2).sum()
    SSE = ((data[yname] - data['yhat']) ** 2).sum()
    # R^2 is defined as 1 - SSE/SST. The shortcut SSR/SST only equals it for
    # an OLS fit with intercept on its own training data; on unseen data
    # SST != SSR + SSE in general, and SSR/SST can even exceed 1.
    r2 = 1 - SSE / SST
    adjustR2 = 1 - (1 - r2) * (n_obs - 1) / dof
    # Residual standard error: SSE is divided by the degrees of freedom
    # rather than by the plain sample size.
    RMSE = (SSE / dof) ** 0.5
    return adjustR2, RMSE
def assessTable(test, train, model, model_k, yname):
    """Tabulate the train and test metrics of ``model`` side by side.

    ``adjustedMetric`` is run on ``test`` first and on ``train`` second.
    The returned DataFrame has rows 'R2' and 'RMSE' and columns 'Train' and
    'Test'; the 'R2' row holds the adjusted R^2 that ``adjustedMetric``
    returns.
    """
    test_scores = adjustedMetric(test, model, model_k, yname)
    train_scores = adjustedMetric(train, model, model_k, yname)
    return pd.DataFrame(
        {'Train': list(train_scores), 'Test': list(test_scores)},
        index=['R2', 'RMSE'],
    )
# Get the assessment table for our model (9 predictors, response 'spy').
assessTable(test=Test, train=Train, model=lm, model_k=9, yname='spy')