Notebook

Need help using sklearn inside this CustomFactor. Why am I not getting "computed_accuracy" in the Pipeline results?
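For reference, the basic CustomFactor pattern I am trying to follow is that compute() receives each input as a 2D numpy array of shape (window_length, num_assets) and writes one value per asset into out, which then shows up as a pipeline column. A minimal sketch of that pattern (the factor and column names here are placeholders, not my actual code):

import numpy as np
from quantopian.pipeline import Pipeline, CustomFactor
from quantopian.pipeline.data.builtin import USEquityPricing

class CloseOverMean(CustomFactor):
    # Placeholder factor: latest close divided by its mean over the window.
    inputs = [USEquityPricing.close]
    window_length = 20

    def compute(self, today, assets, out, close):
        # close has shape (window_length, num_assets); out has shape (num_assets,)
        out[:] = close[-1] / np.nanmean(close, axis=0)

def make_example_pipeline():
    return Pipeline(columns={'close_over_mean': CloseOverMean()})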

In [3]:
#import quantopian.algorithm as algo
from quantopian.pipeline import Pipeline
from quantopian.pipeline import CustomFactor, CustomFilter
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.filters import QTradableStocksUS
from sklearn import svm, cross_validation, neighbors, preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
import numpy as np 
import pandas as pd
import Quandl, math


from quantopian.research import run_pipeline
from quantopian.pipeline.factors import SimpleMovingAverage, AverageDollarVolume


class ML(CustomFactor):
    inputs = [USEquityPricing.close, USEquityPricing.open, USEquityPricing.volume, USEquityPricing.high, USEquityPricing.low]
    #outputs = ('accuracy', 'forecast_out')
    window_length = 246  
    def compute(self, today, assets, open, high, close, Volume, low, out):
        print(type(self.inputs))
        #print(self.inputs)
        
        #df = pd.core.frame.DataFrame([['Adj. Close'],['Adj. High'],['Adj. Volume'],['Adj. Open'],['Adj. Low'],['HL_PCT'],['PCT_change']])
        #df = pd.DataFrame.from_items(self.inputs, index = key)
        for i, assets in enumerate(assets): 

            df = pd.DataFrame.from_items(self.inputs, columns =['Adj. Close', 'Adj. Open', 'Adj. Volume', 'Adj. High', 'Adj. Low']) 
        
        #for i, assets in enumerate(): 
        
            #df['Adj. Close'] = nan(USEquityPricing.close, axis=0)
            #df['Adj. High'] = nan(USEquityPricing.high, axis=0)
            #df['Adj. Volume'] = nan(USEquityPricing.volume, axis=0)
            #df['Adj. Open'] = np.nan(USEquityPricing.open, axis=0)
            #df['Adj. Low'] = np.nan(USEquityPricing.low, axis=0)
        
            #df = pd.core.frame.DataFrame([['Adj. Close'],['Adj. High'],['Adj. Volume'],['Adj. Open'],['Adj. Low'],['HL_PCT'],['PCT_change']])
        
        #df['Adj. Close'] = USEquityPricing.close
        #df['Adj. High'] = USEquityPricing.high
        #df['Adj. Volume'] = USEquityPricing.volume
        #df['Adj. Open'] = USEquityPricing.open
        #df['Adj. Low'] = USEquityPricing.low
        #UnboundLocalError: local variable 'df' referenced before assignment
        
        #df['Adj. Close'] = USEquityPricing.close[0:-1]
        #df['Adj. High'] = USEquityPricing.high[0:-1]
        #df['Adj. Volume'] = USEquityPricing.volume[0:-1]
        #df['Adj. Open'] = USEquityPricing.open[0:-1]
        #df['Adj. Low'] = USEquityPricing.low[0:-1]
        #TypeError: Term.__getitem__() expected a value of type Asset for argument 'key', but got slice instead.
        
            #df['Adj. Close'] = close
            #df['Adj. High'] = high
            #df['Adj. Volume'] = volume
            #df['Adj. Open'] = open
            #df['Adj. Low'] = low
        #UnboundLocalError: local variable 'df' referenced before assignment
        
        #df = df[{'Adj. Close': close} , {'Adj. High':high} , {'Adj. Volume':Volume}]
        #daily_returns = np.diff(close, axis = 0) / close[0:-1]
        
            df['HL_PCT'] = (df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'] * 100.0
            df['PCT_change'] = (df['Adj. Close' ] - df[ 'Adj. Open']) / df['Adj. Open'] * 100.0
            df = df[['Adj. Close' , 'HL_PCT' , 'PCT_change' , 'Adj. Volume']]
        
            forecast_col = 'Adj. Close'
            df.fillna(-99999, inplace=True)
            forecast_out = int(math.ceil(0.01*len(df)))

            df['label'] = df[forecast_col].shift(-forecast_out)
        
        #df.dropna(inplace=True)
        
            X = np.array(df.drop(['label'],1))
            X = preprocessing.scale(X)
            X = X[:-forecast_out]
            X_lately = X[-forecast_out:]
        
            df.dropna(inplace=True)
            Y = np.array(df['label'])
        
            X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2) 
            clf = LinearRegression(n_jobs=-1)
            clf.fit(X_train, Y_train)
            computed_accuracy = clf.score(X_test, Y_test)
            forecast_set = clf.predict(X_lately)
        #print(forecast_set, computed_accuracy, forecast_out)
            df['Forecast'] = np.nan
        
        #out.accuracy[:] = computed_accuracy
        #out.forecast_out[:] = forecast_out

        out[:] = computed_accuracy
        #print(forecast_out)
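Note: print(type(self.inputs)) comes back as a tuple, and as far as I can tell it holds the BoundColumn terms themselves (USEquityPricing.close and friends), not any price data, so pd.DataFrame.from_items(self.inputs, ...) has nothing usable to zip over. The actual data arrives through the compute() arguments as 2D arrays of shape (window_length, num_assets), passed in the same order as inputs and after out (so out comes right after assets, not last). A sketch of how I think the per-asset DataFrame would be built instead (untested; open_ is just to avoid shadowing the builtin):

def compute(self, today, assets, out, close, open_, volume, high, low):
    # Each input is a (window_length, num_assets) array.
    for i in range(len(assets)):
        df = pd.DataFrame({
            'Adj. Close': close[:, i],
            'Adj. Open': open_[:, i],
            'Adj. Volume': volume[:, i],
            'Adj. High': high[:, i],
            'Adj. Low': low[:, i],
        })
        # ... same feature engineering / LinearRegression fit as above ...
        out[i] = np.nan  # placeholder for the per-asset model score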
       
In [4]:
def make_pipeline():
    myPipeline = ML(window_length=246)

    return Pipeline(
        columns={
            'ML_computed_accuracy': myPipeline
        }
    )
#result = run_pipeline(make_pipeline(), '2015-05-05', '2017-05-05')
#result

#result = pipe(make_pipeline(), start_date='2015-11-01', end_date='2017-11-25')
#result.head()

result = run_pipeline(make_pipeline(), '2015-05-05', '2015-05-05')
result

<class 'tuple'>
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-4-d42ce60f04f5> in <module>()
     16 #result.head()
     17 
---> 18 result = run_pipeline(make_pipeline(), '2015-05-05', '2015-05-05')
     19 result

/build/src/qexec_repo/qexec/research/api.py in run_pipeline(pipeline, start_date, end_date, show_progress, chunksize)
    486             show_progress=show_progress,
    487             pipeline_engine=pipeline_engine,
--> 488             holdout_manager=holdout_manager,
    489         )
    490     # The docstring is defined at module scope to get correct indentation.

/build/src/qexec_repo/qexec/research/_api.py in inner_run_pipeline(pipeline, start_date, end_date, show_progress, chunksize, pipeline_engine, holdout_manager)
    735         adjusted_end_date,
    736         chunksize=chunksize,
--> 737         hooks=hooks,
    738     )
    739 

/build/src/qexec_repo/zipline_repo/zipline/pipeline/engine.py in run_chunked_pipeline(self, pipeline, start_date, end_date, chunksize, hooks)
    343         run_pipeline = partial(self._run_pipeline_impl, pipeline, hooks=hooks)
    344         with hooks.running_pipeline(pipeline, start_date, end_date):
--> 345             chunks = [run_pipeline(s, e) for s, e in ranges]
    346 
    347         if len(chunks) == 1:

/build/src/qexec_repo/zipline_repo/zipline/pipeline/engine.py in <listcomp>(.0)
    343         run_pipeline = partial(self._run_pipeline_impl, pipeline, hooks=hooks)
    344         with hooks.running_pipeline(pipeline, start_date, end_date):
--> 345             chunks = [run_pipeline(s, e) for s, e in ranges]
    346 
    347         if len(chunks) == 1:

/build/src/qexec_repo/zipline_repo/zipline/pipeline/engine.py in _run_pipeline_impl(self, pipeline, start_date, end_date, hooks)
    440                 refcounts=refcounts,
    441                 execution_order=execution_order,
--> 442                 hooks=hooks,
    443             )
    444 

/build/src/qexec_repo/zipline_repo/zipline/pipeline/engine.py in compute_chunk(self, graph, dates, sids, workspace, refcounts, execution_order, hooks)
    714                         mask_dates,
    715                         sids,
--> 716                         mask,
    717                     )
    718                 if term.ndim == 2:

/build/src/qexec_repo/zipline_repo/zipline/pipeline/mixins.py in _compute(self, windows, dates, assets, mask)
    219                 inputs = format_inputs(windows, inputs_mask)
    220 
--> 221                 compute(date, masked_assets, out_row, *inputs, **params)
    222                 out[idx][out_mask] = out_row
    223         return out

<ipython-input-3-ee18e5a61d1a> in compute(self, today, assets, _a857d45d0fd154e11b3636e7d243fdb00open, high, close, Volume, low, out)
     28         for i, assets in enumerate(assets):
     29 
---> 30             df = pd.DataFrame.from_items(self.inputs, columns =['Adj. Close', 'Adj. Open', 'Adj. Volume', 'Adj. High', 'Adj. Low'])
     31 
     32         #for i, assets in enumerate():

/venvs/py35/lib/python3.5/site-packages/pandas/core/frame.py in from_items(cls, items, columns, orient)
   1094         frame : DataFrame
   1095         """
-> 1096         keys, values = lzip(*items)
   1097 
   1098         if orient == 'columns':

/venvs/py35/lib/python3.5/site-packages/pandas/compat/__init__.py in lzip(*args, **kwargs)
    113 
    114     def lzip(*args, **kwargs):
--> 115         return list(zip(*args, **kwargs))
    116 
    117     def lmap(*args, **kwargs):

/build/src/qexec_repo/zipline_repo/zipline/pipeline/term.py in __getitem__(self, key)
    238     @expect_types(key=Asset)
    239     def __getitem__(self, key):
--> 240         if isinstance(self, LoadableTerm):
    241             raise NonSliceableTerm(term=self)
    242         return Slice(self, key)

/build/src/qexec_repo/zipline_repo/zipline/utils/input_validation.py in _check(func, argname, argvalue)
    451                     'funcname': get_funcname(func),
    452                     'argname': argname,
--> 453                     'actual': actual(argvalue),
    454                 },
    455             )

TypeError: Term.__getitem__() expected a value of type Asset for argument 'key', but got int instead.
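Note on the traceback: from_items() iterates over self.inputs and ends up indexing each pipeline Term with integers, and Term.__getitem__ only accepts an Asset, hence the TypeError. Separately, since I ultimately want both the accuracy and forecast_out back (the commented-out outputs line above), my understanding is that a CustomFactor can declare multiple named outputs and expose each one as its own pipeline column. A sketch of that pattern (class and column names are mine, values are placeholders):

class MLOutputs(CustomFactor):
    inputs = [USEquityPricing.close]
    outputs = ['accuracy', 'forecast_out']
    window_length = 246

    def compute(self, today, assets, out, close):
        # One value per asset for each declared output.
        out.accuracy[:] = 0.0      # placeholder for the model score
        out.forecast_out[:] = 0.0  # placeholder for the forecast horizon

accuracy, forecast_out = MLOutputs()
pipe = Pipeline(columns={'accuracy': accuracy, 'forecast_out': forecast_out})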
In [2]:
df = Quandl.get('WIKI/GOOGL')
print(df.head())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-bf175233cacd> in <module>()
----> 1 df = Quandl.get('WIKI/GOOGL')
      2 print(df.head())

NameError: name 'Quandl' is not defined
In [49]:
type(df)
Out[49]:
<class 'pandas.core.frame.DataFrame'>
In [50]:
df = df[['Adj. Open','Adj. High','Adj. Low','Adj. Close','Adj. Volume',]]
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

df = df[['Adj. Close','HL_PCT','PCT_change','Adj. Volume']]

print(df.head())
            Adj. Close    HL_PCT  PCT_change  Adj. Volume
Date                                                     
2004-08-19   50.322842  3.712563    0.324968   44659000.0
2004-08-20   54.322689  0.710922    7.227007   22834300.0
2004-08-23   54.869377  3.729433   -1.227880   18256100.0
2004-08-24   52.597363  6.417469   -5.726357   15247300.0
2004-08-25   53.164113  1.886792    1.183658    9188600.0
In [51]:
forecast_col = 'Adj. Close'
df.fillna(-99999, inplace=True)

forecast_out = int(math.ceil(0.01*len(df)))

df['label'] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace=True)
print(df.head())
            Adj. Close    HL_PCT  PCT_change  Adj. Volume      label
Date                                                                
2004-08-19   50.322842  3.712563    0.324968   44659000.0  69.078238
2004-08-20   54.322689  0.710922    7.227007   22834300.0  67.839414
2004-08-23   54.869377  3.729433   -1.227880   18256100.0  68.912727
2004-08-24   52.597363  6.417469   -5.726357   15247300.0  70.668146
2004-08-25   53.164113  1.886792    1.183658    9188600.0  71.219849
In [52]:
df = Quandl.get('WIKI/GOOGL')
df = df[['Adj. Open','Adj. High','Adj. Low','Adj. Close','Adj. Volume',]]
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
df = df[['Adj. Close','HL_PCT','PCT_change','Adj. Volume']]
forecast_col = 'Adj. Close'
df.fillna(-99999, inplace=True)

forecast_out = int(math.ceil(0.01*len(df)))

df['label'] = df[forecast_col].shift(-forecast_out)

X = np.array(df.drop(['label'],1))
X = preprocessing.scale(X)
X = X[:-forecast_out]
X_lately = X[-forecast_out:]

df.dropna(inplace=True)
Y = np.array(df['label'])
#Y = np.array(df['label'])


X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2)

clf = LinearRegression()
clf.fit(X_train, Y_train)
accuracy = clf.score(X_test,Y_test)

#print(accuracy)

forecast_set = clf.predict(X_lately)

print(forecast_set, accuracy, forecast_out)
[ 1077.01316071  1091.30692973  1105.23245931  1099.85628233  1093.43829658
  1090.81359664  1089.08218111  1086.02406656  1080.08122365  1075.77139067
  1073.28252274  1093.08453421  1111.51981442  1115.857149    1130.61039326
  1134.65857965  1133.16932905  1130.68897898  1132.42237496  1151.03604213
  1150.60731349  1159.9024576   1156.60899944  1164.30312227  1184.9873132
  1197.05085934  1191.71477122  1203.25213739  1208.60918774  1207.27143836
  1197.99739648  1203.20443751  1202.01849219  1137.63447361  1086.2002661 ] 0.977966788547 35
/venvs/py35/lib/python3.5/site-packages/sklearn/preprocessing/data.py:153: UserWarning: Numerical issues were encountered when centering the data and might not be solved. Dataset may contain too large values. You may need to prescale your features.
  warnings.warn("Numerical issues were encountered "
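Side note: the standalone run above uses sklearn's cross_validation module; on sklearn 0.18+ that module is deprecated (and later removed) in favor of model_selection, so the same split would be written as follows (assuming a newer sklearn than the research environment ships):

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)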