Notebook
In [2]:
import pandas as pd
from pandas import Timedelta as td
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import datetime as dt
from pykalman import KalmanFilter

##### 5) Trade the current pattern in the direction of the historical outcomes, hoping that history repeats
In [3]:
# Date range of the price history requested from get_pricing.
start = '2002-1-1'
# Same Y-M-D format as `start`; the previous '2015, 12, 1' only worked through
# lenient date parsing.
end = '2015-12-1'

# p = pattern length: number of bars in the window compared against history
p = 125
# o = outcome length: number of bars inspected after each matched pattern
o = 25

In [4]:
def ret_index(prices):
    '''
    Cumulative percentage return of `prices` relative to its first value,
    i.e. the gain (in %) of 1 dollar invested at the first observation.

    The first element of the result is NaN because the first price has no
    preceding price to compute a return from.
    '''
    growth = (prices.pct_change() + 1).cumprod()
    return (growth - 1) * 100

In [5]:
px = get_pricing('SPY', start, end, fields= 'price')
i = len(px.index)-1
#verify
current_close = px.iloc[i]
current_date = px.index[i]
print current_date, current_close
#historical data within to serach for similar patterns
h = px[:i-p]
cp = px.iloc[i-p:i]

2015-12-01 00:00:00+00:00 210.74


# .. 1) done

In [6]:
def similar(x, reference=None):
    '''
    Score how closely one historical window resembles the current pattern.

    x         : 1-D array of values for one historical window (this is what
                the rolling apply passes in).
    reference : optional 1-D array/Series of the same length to compare
                against; defaults to the notebook-level current pattern `cp`.

    Returns the Pearson correlation coefficient between `reference` and `x`,
    or NaN when the coefficient is smaller than its own p-value.
    '''
    if reference is None:
        reference = cp.values
    sp_corr_value, sp_corr_pvalue = stats.pearsonr(np.asarray(reference), x)

    # Acceptance rule: the correlation value returned by pearsonr has to be
    # at least as large as the p-value.
    # NOTE(review): comparing r directly to its p-value is an unusual
    # criterion (a fixed significance level is more common) -- confirm intent.
    if sp_corr_value < sp_corr_pvalue:
        return np.nan
    return sp_corr_value

In [7]:
# Slide a p-bar window over the historical prices and score every window
# against the current pattern with `similar`; windows rejected by `similar`
# (NaN) and the incomplete leading windows are dropped.
correlation = pd.rolling_apply(h, p, similar).dropna()
correlation.name = 'corr'
correlation.plot(style='ro')

Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc59be58890>

# .. 2) done

In [8]:
# Adjust the correlation frame so that only the highest correlation values
# can be picked later WITHOUT overlapping dates: consecutive match dates are
# grouped into clusters (see the next graph with black dots).
gap_days = 7  # a gap longer than this many calendar days starts a new cluster

df = pd.DataFrame(correlation).dropna()
df['date'] = df.index
df.columns = ['corr', 'date']
# Calendar days since the previous match. The first row has no predecessor,
# so it gets p+1, which is > gap_days and therefore opens the first cluster.
# Assigning the filled column back (instead of a chained inplace fillna)
# guarantees the frame itself is updated.
df['delta'] = ((df['date'] - df['date'].shift(1)) / pd.Timedelta('1 days')).fillna(p + 1)
# 'eval' holds the first date of the cluster each row belongs to: set on the
# row that opens a cluster, then carried forward to the rows that follow.
df['eval'] = df.apply(lambda row: row['date'] if row['delta'] > gap_days else np.nan, axis=1)
df['eval'] = df['eval'].ffill()
df[:5]

Out[8]:
corr date delta eval
2002-08-23 00:00:00+00:00 0.145151 2002-08-23 126 2002-08-23
2002-08-26 00:00:00+00:00 0.170073 2002-08-26 3 2002-08-23
2002-08-27 00:00:00+00:00 0.194336 2002-08-27 1 2002-08-23
2002-08-28 00:00:00+00:00 0.212647 2002-08-28 1 2002-08-23
2002-08-29 00:00:00+00:00 0.233133 2002-08-29 1 2002-08-23
In [9]:
# For every cluster keep the date (index label) at which the correlation
# peaks. idxmax returns the label in every pandas version, whereas
# Series.argmax returns an integer position in current pandas.
pat = df['corr'].groupby(df['eval']).apply(lambda grp: grp.idxmax()).values
pat  # dates with max correlation values

Out[9]:
array([Timestamp('2002-12-10 00:00:00+0000', tz='UTC'),
Timestamp('2003-05-08 00:00:00+0000', tz='UTC'),
Timestamp('2004-07-21 00:00:00+0000', tz='UTC'),
Timestamp('2004-10-13 00:00:00+0000', tz='UTC'),
Timestamp('2005-06-27 00:00:00+0000', tz='UTC'),
Timestamp('2005-12-29 00:00:00+0000', tz='UTC'),
Timestamp('2006-09-18 00:00:00+0000', tz='UTC'),
Timestamp('2007-06-05 00:00:00+0000', tz='UTC'),
Timestamp('2007-11-01 00:00:00+0000', tz='UTC'),
Timestamp('2008-05-14 00:00:00+0000', tz='UTC'),
Timestamp('2008-09-26 00:00:00+0000', tz='UTC'),
Timestamp('2009-01-16 00:00:00+0000', tz='UTC'),
Timestamp('2009-05-27 00:00:00+0000', tz='UTC'),
Timestamp('2010-05-04 00:00:00+0000', tz='UTC'),
Timestamp('2010-08-30 00:00:00+0000', tz='UTC'),
Timestamp('2011-11-14 00:00:00+0000', tz='UTC'),
Timestamp('2012-08-23 00:00:00+0000', tz='UTC'),
Timestamp('2013-02-06 00:00:00+0000', tz='UTC'),
Timestamp('2014-12-19 00:00:00+0000', tz='UTC')], dtype=object)
In [10]:
top_n = 10  # number of best-matching patterns to keep

# Correlation at each cluster's best date, strongest first, limited to top_n.
# nlargest replaces Series.order (removed from pandas) followed by a slice.
max_corr = df['corr'].loc[list(pat)].nlargest(top_n)
pat_names = max_corr.index

In [11]:
# All accepted correlations as faint red dots, with the selected best
# matches (the ones we actually want) drawn on top as black dots.
ax = correlation.plot(style='ro', alpha=.2)
max_corr.plot(style='ko', ax=ax)

Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc59bdc8710>

# .. 3) done

In [12]:
#
# Plot preparation: for every selected match, slice the p-bar pattern that
# ends on the match date and the o bars that follow it.
#
# Positions are sorted so the windows are concatenated chronologically, and
# the columns are labelled with the dates at those same positions. Labelling
# them with pat_names directly would be wrong because pat_names is ordered by
# descending correlation, not by date. A dedicated loop variable also avoids
# overwriting the notebook-level `i`.
match_positions = sorted(px.index.get_loc(match_date) for match_date in pat_names)
pat_list = [px.iloc[pos + 1 - p:pos + 1] for pos in match_positions]
out_list = [px.iloc[pos + 1:pos + 1 + o] for pos in match_positions]
df_pat = pd.concat(pat_list, axis=1)
df_out = pd.concat(out_list, axis=1)
df_pat.columns = px.index[match_positions]
df_out.columns = px.index[match_positions]

In [13]:
# Cumulative % return of the current pattern (leading NaN replaced by 0).
ri = ret_index(cp).fillna(0).values

for d in pat_names:
    # Historical pattern / outcome for this match, stripped of the NaN rows
    # introduced by the column-wise concat and re-indexed from 0.
    q_p = df_pat[d].dropna().reset_index(drop=True)
    q_o = df_out[d].dropna().reset_index(drop=True)
    r_p = ret_index(q_p).fillna(0).values
    r_o = ret_index(q_o).fillna(0).values
    # Shift the pattern so its last bar sits at zero.
    plt.plot(range(p), r_p - r_p[-1])
    # Outcome drawn starting at the pattern's last x position, from zero.
    # NOTE(review): the first outcome bar is the day after the match, so it is
    # drawn one position early and its own return is not shown -- confirm.
    plt.plot(range(p - 1, p - 1 + o), r_o - r_o[0])

# Current pattern in black, shifted the same way.
plt.plot(ri - ri[-1], 'k')
plt.show()

# Raw prices of the matched historical patterns.
df_pat.plot()
plt.show()

In [ ]: